adding code for datarinse, codebreaker

wala · Oct 17, 2023 · dad9df7 · dad9df7
1 parent 4911bbc
commit dad9df7
Show file tree

Hide file tree

Showing 276 changed files with 1,164,646 additions and 356 deletions.
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ Website: https://wala.github.io/graph4code/
 * [Large Scale Generation of Labeled Type Data for Python](./docs/use_cases.md#type_inf)
 * [Recommendation engine for developers](./docs/use_cases.md#case1)   
 * [Enforcing best practices](./docs/use_cases.md#case2)  
-* [Learning from big code](./docs/use_cases.md#case3) 
+* [Learning from big code](./docs/use_cases.md#case3)
 
 # Create your own graph
 
@@ -69,18 +69,14 @@ You will also need an installation of `Java JDK 11` for running the jars of code
 
 If you have a new script (code file), run the following command in the jars directory.  Please ensure you have Java 11 before you run.  Note that the last two arguments are to create a unique graph URI for each script that gets analyzed, where the graph URI is made up of <graph prefix> + '/' + <graph qualifier> for a single file.  Note also that we have migrated the RDF store model to RDF* to make it a more compact, easier to understand representation.  We have also added more information about each node.  Model definition will be updated soon.
 
-We provide analysis for both Python 2 and Python 3.  Python 3 is the supported version of Python, but, while Python 2 is no longer supported, many existing datasets have significant quantities it.  Since the two languages have different syntax in some cases, we need two different analyses that rely on diffferent parsers, and hence we have two analysis jars.
+We provide analysis for both Python 2 and Python 3.  Python 3 is the supported version of Python, but, while Python 2 is no longer supported, many existing datasets have significant quantities of it.  Since the two languages have different syntax in some cases, we need two different analyses that rely on different parsers, and hence we have two analysis jars.  **All source code for the files that perform operations on the analysis graphs is now included - see the directories whose names start with `code_breaker`.**
 
-#### Download code analysis libraries
+#### Build the code analysis libraries
 ```
- cd jars
- wget https://archive.org/download/code-breaker-py-3-0.0.1-snapshot/CodeBreaker_py3-0.0.1-SNAPSHOT.jar
+ cd scripts
+ bash setup.sh
 ```
-For corresponding sources:
 
-``
-wget https://archive.org/download/code-breaker-py-3-0.0.1-snapshot-sources/CodeBreaker_py3-0.0.1-SNAPSHOT-sources.jar
-``
 
 #### Usage:
 
@@ -162,6 +158,23 @@ Current accepted prefixes are ai_stackexchange, math_stackexchange, datascience_
 * If you use Graph4CodeGen in your research, please cite our work:
 
 ```
+@article{10.14778/3611540.3611628,
+author = {Abdelaziz, Ibrahim and Dolby, Julian and Khurana, Udayan and Samulowitz, Horst and Srinivas, Kavitha},
+title = {DataRinse: Semantic Transforms for Data Preparation Based on Code Mining},
+year = {2023},
+issue_date = {August 2023},
+publisher = {VLDB Endowment},
+volume = {16},
+number = {12},
+issn = {2150-8097},
+url = {https://doi.org/10.14778/3611540.3611628},
+doi = {10.14778/3611540.3611628},
+abstract = {Data preparation is a crucial first step to any data analysis problem. This task is largely manual, performed by a person familiar with the data domain. DataRinse is a system designed to extract relevant transforms from large scale static analysis of repositories of code. Our motivation is that in any large enterprise, multiple personas such as data engineers and data scientists work on similar datasets. However, sharing or re-using that code is not obvious and difficult to execute. In this paper, we demonstrate DataRinse to handle data preparation, such that the system recommends code designed to help with the preparation of a column for data analysis more generally. We show that DataRinse does not simply shard expressions observed in code but also uses analysis to group expressions applied to the same field such that related transforms appear coherently to a user. It is a human-in-the-loop system where the users select relevant code snippets produced by DataRinse to apply on their dataset.},
+journal = {Proc. VLDB Endow.},
+month = {sep},
+pages = {4090–4093},
+numpages = {4}
+}
 @inproceedings{abdelaziz2023semforms,
       title={ SemFORMS: Automatic Generation of Semantic Transforms By Mining Data Science Code }, 
       author={Ibrahim Abdelaziz, Julian Dolby, Udayan Khurana, Horst Samulowitz, Kavitha Srinivas,

diff --git a/code_breaker/.DS_Store b/code_breaker/.DS_Store
diff --git a/code_breaker/.classpath b/code_breaker/.classpath
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" output="target/classes" path="src/main/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
+		<attributes>
+			<attribute name="test" value="true"/>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
+		<attributes>
+			<attribute name="test" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-11">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/code_breaker/.gitignore b/code_breaker/.gitignore
@@ -0,0 +1,3 @@
+/target/
+/dependency-reduced-pom.xml
+/.idea/
diff --git a/code_breaker/.project b/code_breaker/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>CodeBreaker</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+	</natures>
+</projectDescription>
diff --git a/code_breaker/.settings/org.eclipse.jdt.core.prefs b/code_breaker/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,16 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=11
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=11
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=warning
+org.eclipse.jdt.core.compiler.release=disabled
+org.eclipse.jdt.core.compiler.source=11
diff --git a/code_breaker/.settings/org.eclipse.m2e.core.prefs b/code_breaker/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/code_breaker/CodeBreaker.iml b/code_breaker/CodeBreaker.iml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_11">
+    <output url="file://$MODULE_DIR$/target/classes" />
+    <output-test url="file://$MODULE_DIR$/target/test-classes" />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
+      <excludeFolder url="file://$MODULE_DIR$/target" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="Maven: CodeKnowledgeGraph:CodeBreakerBase:0.0.1-SNAPSHOT" level="project" />
+    <orderEntry type="library" name="Maven: co.elastic.clients:elasticsearch-java:8.2.3" level="project" />
+    <orderEntry type="library" name="Maven: org.elasticsearch.client:elasticsearch-rest-client:8.2.3" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpclient:4.5.10" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore:4.4.12" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpasyncclient:4.1.4" level="project" />
+    <orderEntry type="library" name="Maven: org.apache.httpcomponents:httpcore-nio:4.4.12" level="project" />
+    <orderEntry type="library" name="Maven: commons-codec:commons-codec:1.14" level="project" />
+    <orderEntry type="library" name="Maven: commons-logging:commons-logging:1.1.3" level="project" />
+    <orderEntry type="library" name="Maven: com.google.code.findbugs:jsr305:3.0.2" level="project" />
+    <orderEntry type="library" name="Maven: jakarta.json:jakarta.json-api:2.0.1" level="project" />
+    <orderEntry type="library" name="Maven: org.eclipse.parsson:parsson:1.0.0" level="project" />
+    <orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.12.3" level="project" />
+    <orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.12.3" level="project" />
+    <orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.12.3" level="project" />
+    <orderEntry type="library" name="Maven: org.json:json:20230227" level="project" />
+  </component>
+</module>
diff --git a/code_breaker/README.md b/code_breaker/README.md
@@ -0,0 +1,10 @@
+The script `code_knowledge_graph/scripts/setup.sh` has the steps to build the analysis code for both Python 2 and Python 3.  In the parent directory of the repository, run `bash code_knowledge_graph/scripts/setup.sh` to build the code.
+
+To run the code on a given python file:
+
+ - For Python 3, in `code_breaker_py3`, run `java -DquadFile=<nq file name> -DoutputDir=<dir for json files> -cp target/CodeBreaker_py3-0.0.1-SNAPSHOT.jar util.RunTurtleSingleAnalysis <dir containing files/single file> <repoPath> <path>` to run the analysis on a given Python file (or a directory of files).
+
+ - For Python 2, in `code_breaker_py2`, run `java -DquadFile=<nq file name> -DoutputDir=<dir for json files> -cp target/CodeBreaker_py2-0.0.1-SNAPSHOT.jar util.RunTurtleSingleAnalysis <dir containing files/single file> <repoPath> <path>` to run the analysis on a given Python file (or a directory of files).
+
+To run summaries for data science pipelines:
+In `code_breaker_py3`, run `java -cp target/CodeBreaker_py3-0.0.1-SNAPSHOT.jar util.SummarizeDataScienceGraphsFromJSON <input JSON file from analysis> <output JSON file to store subgraphs>` to summarize the JSON produced by the analysis and store the resulting subgraphs.
diff --git a/code_breaker/data/CodeNet_sample/README b/code_breaker/data/CodeNet_sample/README
@@ -0,0 +1,30 @@
+generated by:
+
+/Volume1/AI4CODE/CodeNet/AI4Code-Datasets/scripts/codenet_aggregate.sh -c=julian-samples.conf -o=julian
+
+from config file:
+
+PROBLEMS=(
+  p00017	# Caesar Cipher
+  p00029	# English Sentence
+  p00052	# Factorial II
+  p00061	# Rank Checker
+  p00105	# Book Index
+)
+
+LANGUAGES=(
+  Java
+)
+
+STATUSES=(
+  Accepted
+)
+
+# Select on code size (>= 0). Bounds are inclusive.
+MIN_CODE_SIZE=50
+MAX_CODE_SIZE=			# undefined means unlimited
+
+# How many samples to provide per selection.
+# A selection is a problem/language/status combination.
+# When not defined, it means all available.
+NUM_SAMPLES=10
diff --git a/code_breaker/data/CodeNet_sample/p00017/s091933654/Main.java b/code_breaker/data/CodeNet_sample/p00017/s091933654/Main.java
@@ -0,0 +1,121 @@
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+public class Main {
+    // Caesar-cipher decoder: reads lines from stdin, derives a shift from the/this/that, prints each shifted line.
+    private static final char START_ALPHABET = 'a';
+    private static final char END_ALPHABET = 'z';
+    private static final char CHAR_T = 't';
+    private static final char CHAR_PERIOD = '.';
+    private static final String COMPARE_THE = "the";
+    private static final String COMPARE_THIS = "this";
+    private static final String COMPARE_THAT = "that";
+    private static final String BLANK = " ";
+
+    public static void main(String[] args) {
+        // Entry point: handles every stdin line independently until end of input; args is not used.
+        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+
+        try {
+            String input = null;
+            while ((input = in.readLine()) != null) {
+                // Split on single blanks, compute this line's shift, then apply it to every word.
+                String[] cutData = input.split(BLANK);
+                int caesarNum = solveCaesarNum(cutData);
+
+                StringBuilder result = new StringBuilder();
+
+                for (String data : cutData) {
+                    result.append(routation(data, caesarNum)).append(BLANK);
+                }
+
+                // Remove the trailing blank
+                result.delete(result.length() - 1, result.length());
+
+                System.out.println(result.toString());
+            }
+
+        } catch (IOException e) {
+            e.printStackTrace();
+        } finally {
+            try {
+                in.close();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+    }
+    // Returns the (possibly negative) shift that turns a 3/4-letter word into the/this/that; 26 if none does.
+    private static int solveCaesarNum(String[] cutData) {
+
+        int cnt = 0;
+        boolean stopFlg = false;
+
+        while (cnt < END_ALPHABET - START_ALPHABET + 1) {
+            // NOTE(review): the scan below does not depend on cnt, so every pass repeats the same checks.
+            for (String data : cutData) {
+
+                String checkData = data;
+                // NOTE(review): an empty token (empty line, doubled blank) makes charAt(length - 1) throw.
+                // If the word ends with a period, drop the period
+                if (data.charAt(data.length() - 1) == CHAR_PERIOD) {
+                    checkData = data.substring(0, data.length() - 1);
+                }
+                // For 3- and 4-letter words: shift by (CHAR_T - first char) so the word starts with 't', then compare.
+                if (checkData.length() == 3) {
+                    String checkThe = routation(checkData,
+                            CHAR_T - checkData.charAt(0));
+
+                    if (COMPARE_THE.equals(checkThe)) {
+                        stopFlg = true;
+                        cnt = CHAR_T - checkData.charAt(0);
+                        break;
+                    }
+                } else if (checkData.length() == 4) {
+
+                    String checkThe = routation(checkData,
+                            CHAR_T - checkData.charAt(0));
+
+                    if (COMPARE_THAT.equals(checkThe)
+                            || COMPARE_THIS.equals(checkThe)) {
+                        stopFlg = true;
+                        cnt = CHAR_T - checkData.charAt(0);
+                        break;
+                    }
+                }
+            }
+
+            if (stopFlg) {
+                break;
+            }
+
+            cnt++;
+        }
+
+        return cnt;
+    }
+    // Adds caesarNum to every character of data except periods; a result outside 'a'..'z' is moved by one alphabet length.
+    private static String routation(String data, int caesarNum) {
+
+        char[] convertChar = data.toCharArray();
+
+        for (int i = 0; i < convertChar.length; i++) {
+
+            // Skip periods (the original comment said "comma", but the check is against CHAR_PERIOD)
+            if (convertChar[i] == CHAR_PERIOD) {
+                continue;
+            }
+
+            convertChar[i] += caesarNum;
+
+            if (convertChar[i] < START_ALPHABET) {
+                convertChar[i] += END_ALPHABET - START_ALPHABET + 1;
+            } else if (convertChar[i] > END_ALPHABET) {
+                convertChar[i] += START_ALPHABET - END_ALPHABET - 1;
+            }
+        }
+
+        return String.valueOf(convertChar);
+    }
+}
diff --git a/code_breaker/data/CodeNet_sample/p00017/s236746930/Main.java b/code_breaker/data/CodeNet_sample/p00017/s236746930/Main.java
@@ -0,0 +1,47 @@
+import java.util.Scanner;
+class Main { // Decodes Caesar-shifted lines read from stdin.
+	public static void main(String args[]){
+		Scanner scan = new Scanner(System.in);
+		// Per stdin line: rotate every char except '.' and ' ' forward by one ('z' -> 'a') until a word is the/this/that.
+		while(scan.hasNextLine()){
+			String line = scan.nextLine();
+			String[] cipher = line.split(" ");
+			char[][] ch = new char[cipher.length][];
+			int i,j;
+			// Keep one char[] per word so the words can be rotated in place.
+			for(i=0;i<cipher.length;i++){
+				ch[i] = cipher[i].toCharArray();
+			}
+
+			String[] str = new String[cipher.length];
+			String s = "";
+			out : while(true){
+				for(i=0;i<cipher.length;i++){
+					for(j=0;j<cipher[i].length();j++){
+						if(ch[i][j]!='.' && ch[i][j]!=' '){
+							if(ch[i][j] == 'z'){
+								ch[i][j] = 'a';
+							}else{
+								ch[i][j] = (char)(ch[i][j]+1);
+							}
+						}
+					}
+					str[i] = String.valueOf(ch[i]);
+				}
+				// NOTE(review): the only exit is the break below; if no word ever matches, this loop never ends.
+				for(i=0;i<cipher.length;i++){
+					if(str[i].equals("the") || str[i].equals("this") || str[i].equals("that")
+							|| str[i].equals("the.") || str[i].equals("this.") || str[i].equals("that.")){
+						break out;
+					}
+				}
+			}
+			// Join the rotated words with single blanks and print the line.
+			for(i=0;i<cipher.length-1;i++){
+				s+=(str[i]+" ");
+			}
+			s+=str[cipher.length-1];
+			System.out.println(s);
+		}
+	}
+}