diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index b38bfc12b5..fe2f69c500 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -881,6 +881,7 @@ INPUT                 += $(FF_HOME)/include
 INPUT                 += $(FF_HOME)/nmt
 INPUT                 += $(FF_HOME)/python
 INPUT                 += $(FF_HOME)/src
+INPUT                 += $(FF_HOME)/lib/substitutions/include
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/lib/substitutions/TUTORIAL.md b/lib/substitutions/TUTORIAL.md
new file mode 100644
index 0000000000..bcf39da603
--- /dev/null
+++ b/lib/substitutions/TUTORIAL.md
@@ -0,0 +1,206 @@
+## Tutorial for the substitutions library with a simple example
+
+#### Create a pattern
+
+```c++
+//we should specify both the node pattern and edge pattern when defining a GraphPattern 
+
+//first define an operator pattern for example, specify the node to have a linear 
+//operator
+OperatorPattern operator_pattern_n0{
+        std::vector<OperatorAttributeConstraint>{OperatorAttributeConstraint{
+            ConstraintType::EQUAL, OperatorAttributeKey::OP_TYPE, Op::LINEAR}}};
+
+//then define a tensor pattern that restricts which edges of the pcg can match. for example,
+//specify that the size of the first dimension (indexed by 0) of a tensor should be 2
+ParallelTensorPattern tensor_pattern_e0{
+    std::vector<TensorAttributeConstraint>{
+        TensorAttributeConstraint{ConstraintType::EQUAL,
+                                    ListIndexAccess<TensorAttributeKey>{
+                                        TensorAttributeKey::DIM_SIZES, 0},
+                                    2}}};
+/*
+remember that both the operator pattern and the tensor pattern hold a std::vector of
+constraints, meaning that you can define more than one constraint depending on the context
+*/
+```
+
+
+#### Pack into GraphPattern
+```c++
+//create a graph with node label of OperatorPattern and edge label of ParallelTensorPattern
+auto ig =
+    OutputLabelledOpenMultiDiGraph<OperatorPattern, ParallelTensorPattern>::
+        create<UnorderedOutputLabelledOpenMultiDiGraph<
+            OperatorPattern,
+            ParallelTensorPattern>>();
+//add constraints defined above as argument to create a node
+Node n0 = ig.add_node(operator_pattern_n0);
+//add port number to distinguish different edges going to the same node
+NodePort p0 = ig.add_node_port();
+//create edge
+InputMultiDiEdge e0{n0, p0, std::make_pair(p0.value(), p0.value())};
+ig.add_edge(e0);
+//add edge constraints above to the edge e0
+ig.add_label(e0, tensor_pattern_e0);
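+GraphPattern input_graph{ig}; //wrap ig as the GraphPattern that the later steps refer to as input_graph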
+//a pattern graph with one input edge pointing to a node
+/*
+        n0 (Linear)
+        ↑
+*/
+RC_ASSERT(get_nodes(ig).size() == 1);
+RC_ASSERT(get_edges(ig).size() == 1);
+```
+
+#### Define OutputGraph
+```cpp
+
+//define a 3-node output graph that will replace the part of the pcg matched by the input graph ig
+
+//Repartition node that partitions the input into two parts
+OperatorAttrAssignment op_ass_n1{
+        {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::REPARTITION}},
+         {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}},
+         {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}};
+
+//Linear node
+OperatorAttrAssignment op_ass_n2{
+    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::LINEAR}},
+        {OperatorAttributeKey::OUT_CHANNELS,
+        OperatorAttrAccess{n0, OperatorAttributeKey::OUT_CHANNELS}},
+        {OperatorAttributeKey::USE_BIAS,
+        OperatorAttrAccess{n0, OperatorAttributeKey::USE_BIAS}},
+        {OperatorAttributeKey::DATA_TYPE,
+        OperatorAttrAccess{n0, OperatorAttributeKey::DATA_TYPE}},
+        {OperatorAttributeKey::ACTIVATION,
+        OperatorAttrAccess{n0, OperatorAttributeKey::ACTIVATION}},
+        {OperatorAttributeKey::REGULARIZER,
+        OperatorAttrAccess{n0, OperatorAttributeKey::REGULARIZER}}}};
+
+//Reduce node that will combine the result of two partitions
+OperatorAttrAssignment op_ass_n3{
+    {{OperatorAttributeKey::OP_TYPE, AttrConstant{Op::REDUCTION}},
+    {OperatorAttributeKey::PARALLEL_DIM, AttrConstant{ff_dim_t{0}}},
+    {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}};
+
+//notice that these assignments will be evaluated 
+//into new operators in the apply_substitution function 
+//and be inserted into the new pcg
+
+//create outputgraph with 3 nodes and 3 edges
+auto og = NodeLabelledOpenMultiDiGraph<OperatorAttrAssignment>::create<
+    UnorderedNodeLabelledOpenMultiDiGraph<OperatorAttrAssignment>>();
+Node n1 = og.add_node(op_ass_n1);
+Node n2 = og.add_node(op_ass_n2);
+Node n3 = og.add_node(op_ass_n3);
+NodePort p1 = og.add_node_port();
+NodePort p2 = og.add_node_port();
+NodePort p3 = og.add_node_port();
+
+InputMultiDiEdge e1{n1, p1, {p1.value(), p1.value()}};
+MultiDiEdge e2{n2, p2, n1, p1};
+MultiDiEdge e3{n3, p3, n2, p2};
+og.add_edge(e1);
+og.add_edge(e2);
+og.add_edge(e3);
+OutputGraphExpr output_graph_expr{og};
+
+/*
+The output graph looks like this
+               n3 (Reduce)
+               ↑
+               n2 (Linear)
+               ↑
+               n1 (Partition)
+               ↑
+*/
+RC_ASSERT(get_nodes(og).size() == 3);
+RC_ASSERT(get_edges(og).size() == 3);
+```
+
+#### Define substitution
+```cpp
+//define two bidicts that specify how the input and output edges are mapped in the substitution
+bidict<InputMultiDiEdge, InputMultiDiEdge> input_mapping;
+input_mapping.equate(e0, e1);
+bidict<OutputMultiDiEdge, OutputMultiDiEdge> output_mapping;
+
+Substitution substitution{
+    input_graph, output_graph_expr, input_mapping, output_mapping};
+```
+
+#### Apply substitution
+```cpp
+
+//create the target pcg that we want to apply the substitution to
+SubParallelComputationGraph pcg =
+    OutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>::create<
+        UnorderedOutputLabelledOpenMultiDiGraph<Operator,
+                                                ParallelTensor>>();
+
+Node n4 = pcg.add_node(Operator{InputAttrs{}, "input"});
+Node n5 = pcg.add_node(Operator{
+    LinearAttrs{1, false, DataType::FLOAT, Activation::RELU, std::nullopt},
+    "linear"});
+NodePort p4 = pcg.add_node_port();
+NodePort p5 = pcg.add_node_port();
+
+MultiDiEdge e4{n5, p5, n4, p4};
+pcg.add_edge(e4);
+pcg.add_label(e4,
+                ParallelTensor(ParallelTensorDims({2, 1}),
+                                DataType::FLOAT,
+                                CreateGrad::YES));
+
+/* Our target pcg looks like this
+           n5 (Linear)
+           ↑
+           n4 (input)
+*/
+
+//create the criterion functions that will test every predefined node and edge constraint
+MatchAdditionalCriterion criterion{
+    [&](Node const &pattern_node, Node const &graph_node) {
+        return operator_satisfies(pcg.at(graph_node),
+                                input_graph.value().at(pattern_node));
+    },
+    [&](OpenMultiDiEdge const &pattern_edge,
+        OpenMultiDiEdge const &graph_edge) {
+        return parallel_tensor_satisfies(
+            pcg.at(graph_edge), input_graph.value().at(pattern_edge));
+    }};
+
+RC_ASSERT(criterion.node_criterion(n0, n5));
+
+
+//find the match points in the target pcg where we can apply the substitution
+std::vector<MultiDiGraphPatternMatch> matches =
+    find_pattern_matches(input_graph, pcg, criterion);
+
+//there is only one match point in the pcg that we defined
+RC_ASSERT(matches.size() == 1);
+
+//apply substitution
+//the number of new pcgs generated is bounded by O(2^(sn)), where s is the number of
+//different substitutions and n is the number of nodes
+SubParallelComputationGraph new_pcg =
+    apply_substitution(pcg, substitution, matches[0]);
+
+//the new pcg now looks as follows
+/*
+    n3 (Reduce)
+    ↑
+    n2 (Linear)
+    ↑
+    n1 (Partition)
+    ↑
+    n4 (Input)
+*/
+RC_ASSERT(get_nodes(new_pcg).size() == 4);
+RC_ASSERT(get_edges(new_pcg).size() == 3);
+```
+
+
+
+
diff --git a/lib/substitutions/include/substitutions/attribute_expr.h b/lib/substitutions/include/substitutions/attribute_expr.h
index 0afd48b431..a5c69649cf 100644
--- a/lib/substitutions/include/substitutions/attribute_expr.h
+++ b/lib/substitutions/include/substitutions/attribute_expr.h
@@ -7,20 +7,55 @@ namespace FlexFlow {
 
 enum class ConstraintType { EQUAL };
 
+/**
+ * @struct ListIndexAccess
+ * @brief Given the attribute key, retrieve the specific value stored at the given index in the attribute.
+ * This struct will be used in EvaluateOperatorAttributeExpr and EvaluateTensorAttributeExpr,
+ * where we evaluate the expression and return the concrete value of the attribute stored at that index.
+ */
 template <typename T>
 struct ListIndexAccess {
   T attribute_key;
   req<int> index;
 };
 
+/**
+ * @struct ListSize
+ * @brief Given the type of an attribute, retrieve the size of the attribute
+ * Specifically, for the OperatorAttributeValue, the size of the attribute is always MAX_TENSOR_DIM
+ * For the TensorAttributeValue, the size of the attribute is the size of the vector that represents 
+ * the specific attribute of tensor in PCG
+ */
 template <typename T>
 struct ListSize {
   req<T> attribute_key;
 };
 
+/**
+ * @brief AttributeExpr is a representation of the ways to access an attribute.
+ *
+ * It can be a direct value, a list index access, or a list size.
+ * For example, the padding of a Conv2D operator is represented as an int,
+ * while the dimensions of a tensor are represented as a vector, whose size
+ * we can access with ListSize and whose individual values we can access
+ * with ListIndexAccess.
+ */
 template <typename T>
 using AttributeExpr = std::variant<T, ListIndexAccess<T>, ListSize<T>>;
 
+
+/**
+ * @struct AttributeConstraint
+ * @brief AttributeConstraint is an additional constraint imposed when doing pattern matching, beyond
+ * just matching graph topology. Specifically, given a pattern and a graph, matching solely the attribute
+ * type is not enough as there are other factors to consider. For example, if we want to fuse two dense
+ * layers, we need to match the input shape; given a dense layer, we need to make sure the input shape matches
+ * the output shape of the previous layer.
+ * 
+ * Given an attribute expression, attribute_expr should have a relationship with attribute_value defined by 
+ * constraint_type. Currently only EQUAL is supported, meaning that the attribute_expr should be equal to 
+ * attribute_value after evaluation.
+ */
 template <typename K, typename V>
 struct AttributeConstraint {
   ConstraintType constraint_type;
@@ -28,6 +63,11 @@ struct AttributeConstraint {
   V attribute_value;
 };
 
+
+/**
+ * @struct AttributePattern
+ * @brief AttributePattern is a collection of attribute constraints for pattern matching to satisfy.
+ */
 template <typename K, typename V>
 struct AttributePattern {
   std::vector<AttributeConstraint<K, V>> attribute_constraints;
diff --git a/lib/substitutions/include/substitutions/get_attribute.h b/lib/substitutions/include/substitutions/get_attribute.h
index 0e6dd4c69b..dbf1560986 100644
--- a/lib/substitutions/include/substitutions/get_attribute.h
+++ b/lib/substitutions/include/substitutions/get_attribute.h
@@ -5,6 +5,10 @@
 #include "operator_pattern.h"
 #include "utils/optional.h"
 
+
+/**
+ * @brief overloading get_attribute functions for different operator attributes.
+ */
 namespace FlexFlow {
 
 std::optional<OperatorAttributeValue> get_attribute(PCGOperatorAttrs const &,
diff --git a/lib/substitutions/include/substitutions/graph_pattern.h b/lib/substitutions/include/substitutions/graph_pattern.h
index 4f4021203b..6e0f839e28 100644
--- a/lib/substitutions/include/substitutions/graph_pattern.h
+++ b/lib/substitutions/include/substitutions/graph_pattern.h
@@ -8,6 +8,16 @@
 
 namespace FlexFlow {
 
+/**
+ * @struct GraphPattern
+ * @brief A GraphPattern is defined as an open graph with node label OperatorPattern
+ * and output label ParallelTensorPattern, which is referred to as the pattern graph.
+ * The graph structure of a GraphPattern instance defines the geometrical property
+ * of the input graph, while the node labels and output labels define its attribute
+ * properties. In detail, the OperatorPattern and ParallelTensorPattern
+ * contain a set of constraints, and the corresponding graph needs to satisfy these
+ * constraints in order to be considered a match.
+ */
 struct GraphPattern
     : public strong_typedef<
           GraphPattern,
@@ -16,15 +26,39 @@ struct GraphPattern
   using strong_typedef::strong_typedef;
 };
 
+/**
+ * @brief Given a pattern, split_pattern is used to split the pattern
+ * and recursively match the sub-patterns.
+ */
 GraphSplit split_pattern(OpenMultiDiGraphView const &pattern);
 
+/**
+ * @brief singleton_pattern is defined as a pattern that has only one node.
+ * A singleton pattern serves as the base case for recursive pattern matching.
+ */
 bool is_singleton_pattern(OpenMultiDiGraphView const &);
 
+/**
+ * @brief operator_satisfies checks if the operator satisfies the set of constraints
+ * shown in the pattern.
+ */
 bool operator_satisfies(Operator const &params, OperatorPattern const &pattern);
 
+
+/**
+ * @brief parallel_tensor_satisfies checks if the parallel tensor satisfies the set of 
+ * constraints shown in the pattern.
+ */
 bool parallel_tensor_satisfies(ParallelTensor const &params,
                                ParallelTensorPattern const &pattern);
 
+/**
+ * @brief assignment_satisfies checks if the provided MultiDiGraphPatternMatch is a valid
+ * description of how GraphPattern can be mapped to SubParallelComputationGraph.
+ *
+ * It checks if the node and edge assignments satisfy the constraints of the pattern and whether
+ * the graph topology matches.
+ */
 bool assignment_satisfies(SubParallelComputationGraph const &,
                           GraphPattern const &,
                           MultiDiGraphPatternMatch const &);
diff --git a/lib/substitutions/include/substitutions/graph_pattern_match.h b/lib/substitutions/include/substitutions/graph_pattern_match.h
index bf6d6b6921..baf3eae4c2 100644
--- a/lib/substitutions/include/substitutions/graph_pattern_match.h
+++ b/lib/substitutions/include/substitutions/graph_pattern_match.h
@@ -6,32 +6,83 @@
 
 namespace FlexFlow {
 
+/**
+ * @struct MultiDiGraphPatternMatch
+ * @brief MultiDiGraphPatternMatch describes a specific location in an OpenMultiDiGraph where a given pattern matches.
+ * 
+ * Given a graph and a pattern there can be zero, one, or multiple locations where it can match.
+ * 
+ * To provide some intuition, consider matching over strings instead of graphs: given a regex pattern "a.b" and a string "acbfadbga", there are two valid match locations: 
+ * we can either match the "acb" at the beginning of the string, or the "adb" in the middle of the string.
+ * MultiDiGraphPatternMatch represents the difference between the two possible locations using a bidict which maps between 
+ * objects in the pattern and the corresponding objects in the matched data structure. For example, in the string example above,
+ * the two matchings would be as follows:
+ * "acbfadbga"   "acbfadbga"
+ *  ^^^               ^^^
+ *  |||               |||
+ *  vvv               vvv
+ * "a.b"             "a.b"
+ * Of course in the context of graphs there are two types of objects to be matched: nodes and edges. 
+ * As such our match consists of not one but two bidict mappings: one for nodes (node_assignment) and one for edges (edge_assignment).
+ */
 struct MultiDiGraphPatternMatch {
   using PatternNode = Node;
   using PCGNode = Node;
+
+  /**
+   * @see OpenMultiDiEdge
+   */
   using PatternEdge = OpenMultiDiEdge;
   using PCGEdge = OpenMultiDiEdge;
 
+  /**
+   * @brief node_assignment describes the mapping between PatternNode and PCGNode as a part of the substitution.
+   */
   bidict<PatternNode, PCGNode> node_assignment;
+
+  /**
+   * @brief edge_assignment describes the mapping between PatternEdge and PCGEdge as a part of the substitution.
+   */
   bidict<PatternEdge, PCGEdge> edge_assignment;
 };
 
+/**
+ * @struct MatchSplit
+ * @brief MatchSplit is a struct that describes a split of a MultiDiGraphPatternMatch into
+ * two sub MultiDiGraphPatternMatches by dividing the nodes in half. When applying pattern
+ * matches, the pattern will be split into two parts and recursively matched against the graph.
+ */
 struct MatchSplit {
   MultiDiGraphPatternMatch prefix_submatch;
   MultiDiGraphPatternMatch postfix_submatch;
 };
 
+/**
+ * @struct MatchAdditionalCriterion
+ * @brief The additional conditions that need to be satisfied besides the geometric properties of the graph.
+ * Specifically as mentioned in attribute_expr.h, other than matching graph topology, we also need to make sure
+ * the attributes (e.g. the shape of a dense layer) match as well. The additional constraints
+ * AttributeConstraint will be imposed inside node_criterion and edge_criterion for each potential match.
+ */
 struct MatchAdditionalCriterion {
   std::function<bool(Node const &, Node const &)> node_criterion;
   std::function<bool(OpenMultiDiEdge const &, OpenMultiDiEdge const &)>
       edge_criterion;
 };
 
+/**
+ * @brief pattern_matches checks if the pattern graph matches the graph with additional conditions defined 
+ * by additional_criterion. It is used as the last checking step to see if the pattern matches the graph 
+ * attributewise inside find_pattern_matches.
+ */
 bool pattern_matches(OpenMultiDiGraphView const &pattern,
                      OpenMultiDiGraphView const &graph,
                      MultiDiGraphPatternMatch const &match,
                      MatchAdditionalCriterion const &additional_criterion);
 
+/**
+ * @brief find_pattern_matches generates all valid matches from pattern to a subgraph of graph.
+ */
 std::vector<MultiDiGraphPatternMatch>
     find_pattern_matches(OpenMultiDiGraphView const &pattern,
                          OpenMultiDiGraphView const &graph,
diff --git a/lib/substitutions/include/substitutions/operator_pattern.h b/lib/substitutions/include/substitutions/operator_pattern.h
index 8fc4ebefc2..078a66a26d 100644
--- a/lib/substitutions/include/substitutions/operator_pattern.h
+++ b/lib/substitutions/include/substitutions/operator_pattern.h
@@ -11,6 +11,18 @@
 
 namespace FlexFlow {
 
+/**
+ * @enum OperatorAttributeKey
+ * @brief OperatorAttributeKey represents the keys of the attributes of an Operator.
+ * Specifically, each operator has a set of attributes, and each attribute has
+ * a key as its name and a concrete value representation.
+ * OP_TYPE is a special OperatorAttributeKey that represents the
+ * type of the Operator and will exist in every Operator. Given the OP_TYPE, the other
+ * attributes will be determined accordingly.
+ * 
+ * For example, a batch matrix multiplication Operator will have OP_TYPE BATCH_MATMUL and 
+ * dimensions as A_SEQ_LENGTH_DIM and B_SEQ_LENGTH_DIM
+ */
 enum class OperatorAttributeKey {
   OP_TYPE, // AnyOp
   USE_BIAS,
@@ -70,6 +82,12 @@ enum class OperatorAttributeKey {
   NUM_INPUTS
 };
 
+
+/**
+ * @brief OperatorAttributeValue is a representation of the concrete value of an attribute of an Operator.
+ * The OperatorAttributeValue is evaluated from AttributeExpr. The datatype of the value corresponds to the 
+ * datatype of the attributekey listed in OperatorAttributeKey.
+ */
 using OperatorAttributeValue =
     std::variant<int,
                  float,
@@ -92,12 +110,23 @@ FF_VISITABLE_STRUCT(ListIndexAccess<FlexFlow::OperatorAttributeKey>,
                     index);
 FF_VISITABLE_STRUCT(ListSize<FlexFlow::OperatorAttributeKey>, attribute_key);
 
+/**
+ * @brief OperatorAttributeConstraint is an instance of template struct AttributeConstraint.
+ */
 using OperatorAttributeConstraint =
     AttributeConstraint<OperatorAttributeKey, OperatorAttributeValue>;
 
+/**
+ * @brief OperatorPattern is an instance of template struct AttributePattern.
+ */
 using OperatorPattern =
     AttributePattern<OperatorAttributeKey, OperatorAttributeValue>;
 
+/**
+ * @brief Given a specific attribute of an Operator, evaluate the expression of the attribute 
+ * using one of the three methods: direct value, list index access, or list size and return the
+ * value of the attribute.
+ */
 std::optional<OperatorAttributeValue>
     evaluate_attribute_expr(Operator const &attrs,
                             AttributeExpr<OperatorAttributeKey> const &expr);
diff --git a/lib/substitutions/include/substitutions/output_graph.h b/lib/substitutions/include/substitutions/output_graph.h
index 4ed90aed06..2d1651dc66 100644
--- a/lib/substitutions/include/substitutions/output_graph.h
+++ b/lib/substitutions/include/substitutions/output_graph.h
@@ -6,23 +6,53 @@
 namespace FlexFlow {
 
 // NOTE(@wmdi) I am not sure whether these should be part of attribute expr.
+
+/**
+ * @struct OperatorAttrAccess
+ * @brief OperatorAttrAccess consists of a node and an expression attr_expr
+ * on the attributes of the operator associated with the node. The value of an
+ * OperatorAttrAccess instance is the value of attr_expr evaluated on the operator
+ * associated with the node.
+ */
 struct OperatorAttrAccess {
   Node node;
   AttributeExpr<OperatorAttributeKey> attr_expr;
 };
 
+/**
+ * @struct AttrConstant
+ * @brief AttrConstant is a constant value that is used as an attribute expression.
+ */
 struct AttrConstant {
   OperatorAttributeValue value;
 };
 
+
+/**
+ * @brief OperatorAttributeExpr is either an access to an attribute of an operator or a constant, and can be
+ * evaluated to a concrete value. OperatorAttributeExpr is used at substitution phase. 
+ * It will be evaluated and used to create new operator with the evaluated value.
+ */
 using OperatorAttributeExpr = std::variant<OperatorAttrAccess, AttrConstant>;
 
-// NOTE(@wmdi): Not sure if it aligns with other design. Or alternatively we can
-// define the assignment for each operator type.
+/**
+ * @brief OperatorAttrAssignment is a collection of OperatorAttributeKey and
+ * OperatorAttributeExpr pairs for a single operator. It defines how the attributes
+ * of a single operator are calculated from the input graph. A pair
+ * {operator_attribute_key, operator_attribute_expr} in the collection means the value
+ * of operator_attribute_expr is assigned to the attribute named operator_attribute_key
+ * of the operator.
+ */
 struct OperatorAttrAssignment {
   std::unordered_map<OperatorAttributeKey, OperatorAttributeExpr> assignments;
 };
 
+/**
+ * @brief An OutputGraphExpr is defined as an open graph with node label 
+ * OperatorAttrAssignment and output label ParallelTensorAttrAssignment, which 
+ * defines how the operator attributes and the parallel tensor attributes of the 
+ * output graph are derived from the input graph.
+ */
 struct OutputGraphExpr
     : public strong_typedef<
           OutputGraphExpr,
diff --git a/lib/substitutions/include/substitutions/parallel_tensor_pattern.h b/lib/substitutions/include/substitutions/parallel_tensor_pattern.h
index 741554142f..612486fe09 100644
--- a/lib/substitutions/include/substitutions/parallel_tensor_pattern.h
+++ b/lib/substitutions/include/substitutions/parallel_tensor_pattern.h
@@ -6,16 +6,41 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief TensorAttributeKey is an enum class that represents the keys of the 
+ * attributes of a Tensor(matrix).
+ * DIM_SIZES describes the size of each dimension of the tensor for data parallelism computation
+ * DIM_DEGREES describes the number of partitions along each dimension of the tensor for data parallelism computation
+ */
 enum class TensorAttributeKey { DIM_SIZES, DIM_DEGREES };
 
+
+/**
+ * @brief DIM_SIZES and DIM_DEGREES are represented by 
+ * a vector of ints that is listed as corresponding dimension
+ */
 using TensorAttributeValue = std::variant<int, std::vector<int>>;
 
+/**
+ * @brief TensorAttributeConstraint is an instance of AttributeConstraint that
+ * defines the constraint a tensor should satisfy when doing pattern matching.
+ */
 using TensorAttributeConstraint =
     AttributeConstraint<TensorAttributeKey, TensorAttributeValue>;
 
+/**
+ * @brief ParallelTensorPattern is an instance of AttributePattern that represents
+ * a set of constraints a parallel tensor should satisfy when doing pattern matching.
+ */
 using ParallelTensorPattern =
     AttributePattern<TensorAttributeKey, TensorAttributeValue>;
 
+
+/**
+ * @brief evaluate_attribute_expr evaluates the attribute expression for a given ParallelTensor.
+ * The ParallelTensor parameter is named tensor_shape because the numerical value will only be used
+ * at runtime. For the substitution phase, all that matters is the shape of the tensor.
+ */
 std::optional<TensorAttributeValue>
     evaluate_attribute_expr(ParallelTensor const &tensor_shape,
                             AttributeExpr<TensorAttributeKey> const &expr);
diff --git a/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h b/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h
index 0d6bfe7628..e5940007c8 100644
--- a/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h
+++ b/lib/substitutions/include/substitutions/sub_parallel_computation_graph.h
@@ -9,6 +9,13 @@
 
 namespace FlexFlow {
 
+/**
+ * @brief SubParallelComputationGraph is defined as an open graph, which allows nodes and edges 
+ * that are not from the same graph to be added to it.
+ * This definition is useful when we want to split and merge graphs when doing pattern matching.
+ * In contrast, the ParallelComputationGraph is defined as a closed graph and all the edges and 
+ * nodes are within that graph.
+ */
 using SubParallelComputationGraph =
     OutputLabelledOpenMultiDiGraph<Operator, ParallelTensor>;
 
diff --git a/lib/substitutions/include/substitutions/substitution.h b/lib/substitutions/include/substitutions/substitution.h
index 8dbe4e66cf..630f834e3d 100644
--- a/lib/substitutions/include/substitutions/substitution.h
+++ b/lib/substitutions/include/substitutions/substitution.h
@@ -7,6 +7,18 @@
 
 namespace FlexFlow {
 
+/**
+ * @struct Substitution
+ * @brief A substitution is to replace a subgraph of the PCG by a new one.
+ * We refer to the subgraph to be replaced as the input graph, and the new
+ * subgraph to replace the input graph as the output graph.
+ * A Substitution object describes a substitution. It consists of an
+ * input_graph of type GraphPattern that describes which kind of input graphs
+ * the substitution can be applied to; an output_graph of type OutputGraphExpr
+ * that describes how the output graph is computed from the input graph; and
+ * an input_mapping and output_mapping that describe how the output graph is
+ * connected to the original PCG.
+ */
 struct Substitution {
   using InputPatternInput = InputMultiDiEdge;
   using InputPatternOutput = OutputMultiDiEdge;
@@ -19,8 +31,14 @@ struct Substitution {
   bidict<InputPatternOutput, OutputPatternOutput> output_mapping;
 };
 
+/**
+ * @brief is_valid_substitution checks if the substitution is valid. 
+ * The implementation will enumerate all the possible substitutions and filter 
+ * out all the invalid ones.
+ */
 bool is_valid_substitution(Substitution const &);
 
+
 SubParallelComputationGraph
     apply_substitution(SubParallelComputationGraph const &,
                        Substitution const &,