Add the ability to process datasets coming out of enumerate() (#71)
* Add neural network test.
* New tests.
* Change log message.
* Add Dataset.repeat().
* Add log.
* Add prefetch.
* Add Dataset.take().
* Add tf.nn.softmax().
* Add the ability to process datasets coming out of enumerate().
Showing 4 changed files with 297 additions and 16 deletions.
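A minimal, illustrative sketch (not taken from the diff) of the tf.data pattern named in the commit message: a dataset flowing through repeat()/shuffle()/batch()/prefetch(), bounded with take(), consumed via enumerate(), and passed through tf.nn.softmax():

# Illustrative sketch only; not part of the commit.
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(tf.ones([8, 4]))
dataset = dataset.repeat().shuffle(8).batch(2).prefetch(1)

# enumerate() yields (index, element) pairs; the element is the batched tensor.
for step, batch in enumerate(dataset.take(3), 1):
    print(step, tf.nn.softmax(batch).shape)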
# From https://github.com/aymericdamien/TensorFlow-Examples/blob/6dcbe14649163814e72a22a999f20c5e247ce988/tensorflow_v2/notebooks/3_NeuralNetworks/neural_network.ipynb.

# %%
# # Neural Network Example

# Build a 2-hidden-layer fully connected neural network (a.k.a. multilayer perceptron) with TensorFlow v2.

# This example uses a low-level approach to better understand all the mechanics behind building neural networks and the training process.

# - Author: Aymeric Damien
# - Project: https://github.com/aymericdamien/TensorFlow-Examples/

# %%
# ## Neural Network Overview

# <img src="http://cs231n.github.io/assets/nn1/neural_net2.jpeg" alt="nn" style="width: 400px;"/>

# ## MNIST Dataset Overview

# This example uses MNIST handwritten digits. The dataset contains 60,000 examples for training and 10,000 examples for testing. The digits have been size-normalized and centered in a fixed-size image (28x28 pixels) with values from 0 to 255.

# In this example, each image will be converted to float32, normalized to [0, 1], and flattened to a 1-D array of 784 features (28*28).

#  

# More info: http://yann.lecun.com/exdb/mnist/

# %%
from __future__ import absolute_import, division, print_function

import tensorflow as tf
print("TensorFlow version:", tf.__version__)
assert tf.__version__ == "2.15.0"
from tensorflow.keras import Model, layers
import numpy as np
import timeit

start_time = timeit.default_timer()
skipped_time = 0

# %%
# MNIST dataset parameters.
num_classes = 10  # total classes (0-9 digits).
num_features = 784  # data features (img shape: 28*28).

# Training parameters.
learning_rate = 0.1
training_steps = 20000
batch_size = 256
display_step = 100

# Network parameters.
n_hidden_1 = 128  # 1st layer number of neurons.
n_hidden_2 = 256  # 2nd layer number of neurons.

# %%
# Prepare MNIST data.
from tensorflow.keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Convert to float32.
x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
# Flatten images to 1-D vector of 784 features (28*28).
x_train, x_test = x_train.reshape([-1, num_features]), x_test.reshape([-1, num_features])
# Normalize image values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255., x_test / 255.

# %%
# Use tf.data API to shuffle and batch data.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)
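# Here repeat() loops the stream indefinitely, shuffle(5000) draws from a
# 5000-element buffer, batch(batch_size) groups 256 examples per step, and
# prefetch(1) prepares the next batch while the current one is being consumed.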
# %%
# Create TF Model.
class NeuralNet(Model):
    # Set layers.
    def __init__(self):
        super(NeuralNet, self).__init__()
        # First fully-connected hidden layer.
        self.fc1 = layers.Dense(n_hidden_1, activation=tf.nn.relu)
        # Second fully-connected hidden layer.
        self.fc2 = layers.Dense(n_hidden_2, activation=tf.nn.relu)
        # Output layer (logits).
        self.out = layers.Dense(num_classes)
    # Set forward pass.
    def call(self, x, is_training=False):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.out(x)
        if not is_training:
            # tf cross entropy expects logits without softmax, so only
            # apply softmax when not training.
            x = tf.nn.softmax(x)
        return x


# Build neural network model.
neural_net = NeuralNet()
# %%
# Cross-Entropy Loss.
# Note that this will apply 'softmax' to the logits.
def cross_entropy_loss(x, y):
    # Convert labels to int64 for tf cross-entropy function.
    y = tf.cast(y, tf.int64)
    # Apply softmax to logits and compute cross-entropy.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=x)
    # Average loss across the batch.
    return tf.reduce_mean(loss)


# Accuracy metric.
def accuracy(y_pred, y_true):
    # Predicted class is the index of the highest score in the prediction vector (i.e., argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)
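# Illustrative example: for y_pred = [[0.1, 0.9], [0.8, 0.2]] and y_true = [1, 0],
# argmax gives predicted classes [1, 0]; both match, so accuracy is 1.0.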
# Stochastic gradient descent optimizer.
optimizer = tf.optimizers.SGD(learning_rate)


# %%
# Optimization process.
def run_optimization(x, y):
    # Wrap computation inside a GradientTape for automatic differentiation.
    with tf.GradientTape() as g:
        # Forward pass.
        pred = neural_net(x, is_training=True)
        # Compute loss.
        loss = cross_entropy_loss(pred, y)

    # Variables to update, i.e. trainable variables.
    trainable_variables = neural_net.trainable_variables

    # Compute gradients.
    gradients = g.gradient(loss, trainable_variables)

    # Update W and b following gradients.
    optimizer.apply_gradients(zip(gradients, trainable_variables))
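# enumerate(dataset, 1) below yields (step, element) pairs starting at step 1;
# take(training_steps) bounds the infinitely repeated dataset to exactly
# training_steps batches.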
# %%
# Run training for the given number of steps.
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    # Run the optimization to update W and b values.
    run_optimization(batch_x, batch_y)

    if step % display_step == 0:
        pred = neural_net(batch_x, is_training=True)
        loss = cross_entropy_loss(pred, batch_y)
        acc = accuracy(pred, batch_y)
        print_time = timeit.default_timer()
        print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))
        skipped_time += timeit.default_timer() - print_time

# %%
# Test model on validation set.
pred = neural_net(x_test, is_training=False)
print_time = timeit.default_timer()
print("Test Accuracy: %f" % accuracy(pred, y_test))
skipped_time += timeit.default_timer() - print_time
# %%
# Visualize predictions.
import matplotlib.pyplot as plt

# %%
# Predict 5 images from validation set.
n_images = 5
test_images = x_test[:n_images]
predictions = neural_net(test_images)

print("Elapsed time: ", timeit.default_timer() - start_time - skipped_time)

# Display image and model prediction.
for i in range(n_images):
    plt.imshow(np.reshape(test_images[i], [28, 28]), cmap='gray')
    plt.show()
    print("Model prediction: %i" % np.argmax(predictions.numpy()[i]))