From 9fdb6f22f2842fdb9d9196497800e34a8dd8de92 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Mon, 6 Jan 2025 14:40:54 +0100
Subject: [PATCH] More cleanup + restructuring

---
 notebooks/Alhazen.ipynb | 336 +++++++++++++++++++++++++---------------
 1 file changed, 209 insertions(+), 127 deletions(-)
diff --git a/notebooks/Alhazen.ipynb b/notebooks/Alhazen.ipynb
index d7b7188c..35adb5d0 100644
--- a/notebooks/Alhazen.ipynb
+++ b/notebooks/Alhazen.ipynb
@@ -147,6 +147,14 @@
     "## Motivation"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Info:** We use the functionality provided by [The Fuzzing Book](https://www.fuzzingbook.org).\n",
+    "For a more detailed description of Grammars, have a look at the chapter [\"Fuzzing with Grammars\"](https://www.fuzzingbook.org/html/Grammars.html)."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -203,15 +211,6 @@
     "We see that the `CALCULATOR` Grammar consists of several production rules. The calculator subject will only accept inputs that conform to this grammar definition."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-info\">\n",
-    "[Info]: We use the functionallity provided by <a href=\"https://www.fuzzingbook.org\">The Fuzzingbook</a>. For a more detailed description of Grammars, have a look at the chapter <a href=\"https://www.fuzzingbook.org/html/Grammars.html\">Fuzzing with Grammars</a>.\n",
-    "</div>"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -401,15 +400,6 @@
     "The function `sample_runner(sample)` returns an `OracleResult.UNDEF` whenever the runner is not able to execute the sample."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-danger\" role=\"alert\">\n",
-    "To work reliably, you have to remove all samples from the learning set of Alhazen that do not conform to the grammar. \n",
-    "</div>"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -423,21 +413,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import uuid\n",
-    "\n",
-    "# executes a list of samples and return the execution outcome (label)\n",
-    "# the functions returns a pandas dataframe\n",
+    "# Executes a list of samples and returns the execution outcome (label)\n",
+    "# The function returns a pandas dataframe\n",
     "def execute_samples(sample_list):\n",
     "    data = []\n",
     "    for sample in sample_list:\n",
-    "        id = uuid.uuid1()\n",
     "        result = sample_runner(sample)\n",
-    "        data.append({\n",
-    "                # \"sample_id\": id.hex,\n",
-    "                # \"sample\": sample,\n",
-    "                # \"subject\": SUBJECT,\n",
-    "                \"oracle\": result\n",
-    "        })\n",
+    "        data.append({\"oracle\": result })\n",
+    "\n",
     "    return pandas.DataFrame.from_records(data)"
    ]
   },
@@ -555,16 +538,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "<div class=\"alert alert-info\">\n",
-    "[Info]: <i>Alhazen</i> is a tool that automatically learns the circumstances of program failure by associating syntactical features of sample inputs with the execution outcome. The produced explanations (in the form of a decision tree) help developers focus on the input space's relevant aspects.\n",
-    "</div>"
+    "## The Alhazen Algorithm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "_Alhazen_ is a tool that automatically learns the circumstances of program failure by associating syntactical features of sample inputs with the execution outcome. The produced explanations (in the form of a decision tree) help developers focus on the input space's relevant aspects."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Features"
+    "## Step 1: Extracting Features"
    ]
   },
   {
@@ -760,7 +748,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def extract_existence(grammar: Grammar) -> List[ExistenceFeature]:\n",
+    "def extract_existence_features(grammar: Grammar) -> List[ExistenceFeature]:\n",
     "    '''\n",
     "        Extracts all existence features from the grammar and returns them as a list.\n",
     "        grammar : The input grammar.\n",
@@ -794,7 +782,7 @@
     "# Regex for non-terminal symbols in expansions\n",
     "RE_NONTERMINAL = re.compile(r'(<[^<> ]*>)')\n",
     "\n",
-    "def extract_numeric(grammar: Grammar) -> List[NumericInterpretation]:\n",
+    "def extract_numeric_features(grammar: Grammar) -> List[NumericInterpretation]:\n",
     "    '''\n",
     "        Extracts all numeric interpretation features from the grammar and returns them as a list.\n",
     "\n",
@@ -851,8 +839,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_all_features(grammar: Grammar) -> List[Feature]:\n",
-    "    return extract_existence(grammar) + extract_numeric(grammar)"
+    "def extract_all_features(grammar: Grammar) -> List[Feature]:\n",
+    "    return (extract_existence_features(grammar)\n",
+    "            + extract_numeric_features(grammar))"
    ]
   },
   {
@@ -861,7 +850,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "get_all_features(CALC_GRAMMAR)"
+    "extract_all_features(CALC_GRAMMAR)"
    ]
   },
   {
@@ -870,7 +859,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "[f.friendly_name() for f in get_all_features(CALC_GRAMMAR)]   "
+    "[f.friendly_name() for f in extract_all_features(CALC_GRAMMAR)]"
    ]
   },
   {
@@ -899,7 +888,7 @@
     "    data = []\n",
     "\n",
     "    # parse grammar and extract features\n",
-    "    all_features = get_all_features(grammar)\n",
+    "    all_features = extract_all_features(grammar)\n",
     "\n",
     "    # iterate over all samples\n",
     "    for sample in sample_list:\n",
@@ -951,7 +940,7 @@
     "\n",
     "    features = {}\n",
     "    for tree in earley.parse(sample):\n",
-    "        for feature in get_all_features(CALC_GRAMMAR):\n",
+    "        for feature in extract_all_features(CALC_GRAMMAR):\n",
     "            features[feature.name_rep()] = feature.get_feature_value(tree)\n",
     "    return features"
    ]
@@ -962,7 +951,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_features = get_all_features(CALC_GRAMMAR)\n",
+    "all_features = extract_all_features(CALC_GRAMMAR)\n",
     "for sample in sample_list:\n",
     "    print(f\"Features of {sample}:\")\n",
     "    features = compute_feature_values(sample, CALC_GRAMMAR, all_features)\n",
@@ -974,7 +963,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Transforming Grammars"
+    "### Excursion: Transforming Grammars"
    ]
   },
   {
@@ -1001,13 +990,6 @@
     "For the grammar transformation, we perform a *rewrite step* that for each non-terminal symbol in the grammar, determines the word derived by this symbol in the input and adds it as an alternative to the symbol (as written in the Alhazen-paper). Here, we iterate through the derivation tree of the input and add the derived word of each non-terminal as alternatives to the grammar."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Implementing `transform_grammar()`"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1106,7 +1088,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Learning Syntactical Features"
+    "### End of Excursion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Train Classification Model"
    ]
   },
   {
@@ -1262,6 +1251,53 @@
     "show_decision_tree(clf, vec.get_feature_names_out())"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import math\n",
+    "\n",
+    "def friendly_decision_tree(clf, feature_names, class_names = ['NO_BUG', 'BUG']):\n",
+    "    def _tree(index, indent=0):\n",
+    "        s = \"\"\n",
+    "        feature = clf.tree_.feature[index]\n",
+    "        feature_name = feature_names[feature]\n",
+    "        threshold = clf.tree_.threshold[index]\n",
+    "        value = clf.tree_.value[index]\n",
+    "        class_ = int(value[0][0])\n",
+    "        class_name = class_names[class_]\n",
+    "        left = clf.tree_.children_left[index]\n",
+    "        right = clf.tree_.children_right[index]\n",
+    "        if left == right:\n",
+    "            # Leaf node\n",
+    "            s += \" \" * indent + class_name + \"\\n\"\n",
+    "        else:\n",
+    "            if math.isclose(threshold, 0.5):\n",
+    "                s += \" \" * indent + f\"if {feature_name}:\\n\"\n",
+    "                s += _tree(right, indent + 2)\n",
+    "                s += \" \" * indent + f\"else:\\n\"\n",
+    "                s += _tree(left, indent + 2)\n",
+    "            else:\n",
+    "                s += \" \" * indent + f\"if {feature_name} <= {threshold:.4f}:\\n\"\n",
+    "                s += _tree(left, indent + 2)\n",
+    "                s += \" \" * indent + f\"else:\\n\"\n",
+    "                s += _tree(right, indent + 2)\n",
+    "        return s\n",
+    "\n",
+    "    return _tree(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(friendly_decision_tree(clf, vec.get_feature_names_out()))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1354,7 +1390,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Extracting Requirements"
+    "## Step 3: Extract Feature Requirements"
    ]
   },
   {
@@ -1435,6 +1471,15 @@
     "graph"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(friendly_decision_tree(clf, feature_names, class_names = ['NO_BUG', 'BUG']))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1695,7 +1740,6 @@
    "outputs": [],
    "source": [
     "import logging\n",
-    "from pathlib import Path\n",
     "\n",
     "def tree_to_paths(tree, features: List[Feature]):\n",
     "    logging.info(\"Extracting requirements from tree ...\")\n",
@@ -1723,7 +1767,6 @@
    "outputs": [],
    "source": [
     "class TreeRequirement:\n",
-    "\n",
     "    def __init__(self, feature: Feature, mini, maxi):\n",
     "        self.__feature: Feature = feature\n",
     "        self.__mini = mini\n",
@@ -1802,11 +1845,18 @@
     "        elif not numpy.isinf(self.__maxi):\n",
     "            return [f\"{self.__feature} > {self.__maxi}\"]\n",
     "        else:\n",
-    "            return [f\"{self.__feature} <= {self.__mini}\"]\n",
-    "\n",
+    "            return [f\"{self.__feature} <= {self.__mini}\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
     "\n",
     "class TreePath:\n",
-    "\n",
     "    def __init__(self, samplefile: Optional[Path], is_bug: bool, requirements: List[TreeRequirement]):\n",
     "        self.__sample = samplefile\n",
     "        self.__is_bug = is_bug\n",
@@ -1883,7 +1933,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Negating Requirements"
+    "## Step 4: Generating New Samples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Negating Requirements"
    ]
   },
   {
@@ -2347,7 +2404,7 @@
     "                                 'InputSpecification(Requirement(exists(<digit>) > 1.5), Requirement(exists(<function>@0) > 0.5), Requirement(num(<term>) <= 0.21850000321865082))',\n",
     "                                 'InputSpecification(Requirement(exists(<function>@0) > 0.5))']\n",
     "\n",
-    "all_features = extract_existence(CALC_GRAMMAR) + extract_numeric(CALC_GRAMMAR)\n",
+    "all_features = extract_all_features(CALC_GRAMMAR)\n",
     "\n",
     "earley = EarleyParser(SPEC_GRAMMAR)\n",
     "for count, sample in enumerate(sample_prediction_paths):\n",
@@ -2371,13 +2428,6 @@
     "### End of Excursion"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Generating New Samples"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2515,6 +2565,7 @@
     "        done, best_chosen = best_trees(best_chosen, spec)\n",
     "        if done:\n",
     "            final_samples.append(tree_to_string(best_chosen))\n",
+    "\n",
     "        while not done and time.time() - starttime < each_spec_timeout:\n",
     "            # split in prefix, postfix and try to reach targets\n",
     "            for tree in best_chosen:\n",
@@ -2641,7 +2692,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Putting it all Together"
+    "## Step 5: Executing New Input Files"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The Alhazen Class"
    ]
   },
   {
@@ -2651,28 +2709,47 @@
    "outputs": [],
    "source": [
     "class Alhazen:\n",
-    "    def __init__(self, initial_inputs: List[str],\n",
+    "    def __init__(self,\n",
+    "                 runner: Any,\n",
     "                 grammar: Grammar,\n",
-    "                 max_iter: int = 10,\n",
+    "                 initial_inputs: List[str], /,\n",
+    "                 max_iterations: int = 10,\n",
     "                 generator_timeout: int = 10):\n",
     "\n",
     "        self._initial_inputs = initial_inputs\n",
     "        self._grammar = grammar\n",
-    "        self._max_iter = max_iter\n",
+    "        self._runner = runner\n",
+    "        self._max_iter = max_iterations\n",
     "        self._previous_samples = None\n",
     "        self._data = None\n",
     "        self._trees = []\n",
     "        self._generator_timeout = generator_timeout\n",
-    "        self._setup()\n",
-    "\n",
+    "        self._setup()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Alhazen(Alhazen):\n",
     "    def _setup(self):\n",
     "        self._previous_samples = self._initial_inputs\n",
     "\n",
-    "        self._all_features = extract_existence(self._grammar) + extract_numeric(self._grammar)\n",
+    "        self._all_features = extract_all_features(self._grammar)\n",
     "        self._feature_names = [f.name for f in self._all_features]\n",
-    "        print(\"Features:\", \", \".join(f.friendly_name() \n",
-    "                                     for f in self._all_features))\n",
-    "\n",
+    "        print(\"Features:\", \", \".join(f.friendly_name()\n",
+    "                                     for f in self._all_features))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Alhazen(Alhazen):\n",
     "    def _add_new_data(self, exec_data, feature_data):\n",
     "        joined_data = exec_data.join(feature_data.drop(['sample'], axis=1))\n",
     "\n",
@@ -2683,12 +2760,39 @@
     "            if self._data is None:\n",
     "                self._data = new_data\n",
     "            else:\n",
-    "                self._data = pandas.concat([self._data, new_data], sort=False)\n",
-    "\n",
+    "                self._data = pandas.concat([self._data, new_data], sort=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Alhazen(Alhazen):\n",
     "    def _finalize(self):\n",
     "        return self._trees"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Alhazen(Alhazen):\n",
+    "    def execute_samples(self, sample_list = None):\n",
+    "        if sample_list is None:\n",
+    "            sample_list = self._initial_inputs\n",
+    "\n",
+    "        data = []\n",
+    "        for sample in sample_list:\n",
+    "            result = self._runner(sample)\n",
+    "            data.append({\"oracle\": result })\n",
+    "\n",
+    "        return pandas.DataFrame.from_records(data)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -2698,14 +2802,22 @@
     "class Alhazen(Alhazen):\n",
     "    def run(self):\n",
     "        for iteration in range(self._max_iter):\n",
-    "            print(f\"Iteration #{iteration}\")\n",
-    "            self._loop(self._previous_samples)\n",
-    "\n",
-    "        return self._finalize()\n",
+    "            print(f\"\\nIteration #{iteration}\")\n",
+    "            self._iterate(self._previous_samples)\n",
     "\n",
-    "    def _loop(self, sample_list):\n",
+    "        return self._finalize()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Alhazen(Alhazen):\n",
+    "    def _iterate(self, sample_list):\n",
     "        # Obtain labels, execute samples (Initial Step)\n",
-    "        exec_data = execute_samples(sample_list)\n",
+    "        exec_data = self.execute_samples(sample_list)\n",
     "\n",
     "        # Collect features from the new samples\n",
     "        feature_data = collect_features(sample_list, self._grammar)\n",
@@ -2736,6 +2848,13 @@
     "        self._previous_samples = new_samples"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### A Sample Run"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2752,7 +2871,7 @@
     "# Set the number of refinement iterations and the timeout for the input generator\n",
     "# The execution time of Alhazen mainly depends on the number of iterations\n",
     "\n",
-    "MAX_ITERATION = 20\n",
+    "MAX_ITERATIONS = 20\n",
     "GENERATOR_TIMEOUT = 10 # timeout in seconds"
    ]
   },
@@ -2763,8 +2882,9 @@
    "outputs": [],
    "source": [
     "# We initialize Alhazen with the previously used sample_list (['sqrt(-16)', 'sqrt(4)'])\n",
-    "alhazen = Alhazen(initial_sample_list,\n",
-    "                  CALC_GRAMMAR, MAX_ITERATION, GENERATOR_TIMEOUT)\n",
+    "alhazen = Alhazen(sample_runner, CALC_GRAMMAR, initial_sample_list,\n",
+    "                  max_iterations=MAX_ITERATIONS,\n",
+    "                  generator_timeout=GENERATOR_TIMEOUT)\n",
     "\n",
     "# and run it\n",
     "# Alhazen returns a list of all the iteratively learned decision trees\n",
@@ -2784,7 +2904,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "final_tree = trees[MAX_ITERATION-1]\n",
+    "final_tree = trees[MAX_ITERATIONS-1]\n",
     "final_tree"
    ]
   },
@@ -2794,7 +2914,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_features = get_all_features(CALC_GRAMMAR)\n",
+    "all_features = extract_all_features(CALC_GRAMMAR)\n",
     "all_feature_names = [f.friendly_name() for f in all_features]"
    ]
   },
@@ -2823,44 +2943,6 @@
     "show_decision_tree(remove_unequal_decisions(final_tree), all_feature_names)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import math\n",
-    "\n",
-    "def friendly_decision_tree(clf, feature_names, class_names = ['NO_BUG', 'BUG']):\n",
-    "    def _tree(index, indent=0):\n",
-    "        s = \"\"\n",
-    "        feature = clf.tree_.feature[index]\n",
-    "        feature_name = feature_names[feature]\n",
-    "        threshold = clf.tree_.threshold[index]\n",
-    "        value = clf.tree_.value[index]\n",
-    "        class_ = int(value[0][0])\n",
-    "        class_name = class_names[class_]\n",
-    "        left = clf.tree_.children_left[index]\n",
-    "        right = clf.tree_.children_right[index]\n",
-    "        if left == right:\n",
-    "            # Leaf node\n",
-    "            s += \" \" * indent + class_name + \"\\n\"\n",
-    "        else:\n",
-    "            if math.isclose(threshold, 0.5):\n",
-    "                s += \" \" * indent + f\"if {feature_name}:\\n\"\n",
-    "                s += _tree(right, indent + 2)\n",
-    "                s += \" \" * indent + f\"else:\\n\"\n",
-    "                s += _tree(left, indent + 2)\n",
-    "            else:\n",
-    "                s += \" \" * indent + f\"if {feature_name} <= {threshold:.4f}:\\n\"\n",
-    "                s += _tree(left, indent + 2)\n",
-    "                s += \" \" * indent + f\"else:\\n\"\n",
-    "                s += _tree(right, indent + 2)\n",
-    "        return s\n",
-    "\n",
-    "    return _tree(0)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,