Merge pull request #1 from codefuse-ai/init

Initial PR from Ant Program Analysis Team
codefuse-ai · Nov 27, 2023 · b3ddb88 · b3ddb88
2 parents b75f0e2 + b75a5a7
commit b3ddb88
Show file tree

Hide file tree

Showing 245 changed files with 58,891 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,36 @@
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+logs/
+*.log
+.DS_Store
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/
+.cloudide
diff --git a/LEGAL.md b/LEGAL.md
@@ -0,0 +1,7 @@
+Legal Disclaimer
+
+Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.
+
+法律免责声明
+
+关于代码注释部分，中文注释为官方版本，其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致，当中文注释与其它语言注释存在不一致时，请以中文注释为准。
diff --git a/README.md b/README.md
diff --git a/cli/database/__init__.py b/cli/database/__init__.py
diff --git a/cli/database/create.py b/cli/database/create.py
@@ -0,0 +1,122 @@
+import json
+import re
+import time
+from pathlib import Path
+
+from extractor.extractor import *
+
+
+def conf_check(args):
+    src_path = Path(args.source_root).expanduser().resolve()
+    if not src_path.exists():
+        logging.error("source not exist, Please redefine the directory or file by --source-root or -s")
+        return False
+    output_path = Path(args.output).expanduser().resolve()
+    if not output_path.exists():
+        logging.warning("%s not exists, it will be created", str(output_path))
+        try:
+            output_path.mkdir(parents=True)
+            logging.info("%s success build", str(output_path))
+        except Exception as e:
+            logging.error("can not to create database directory %s: %s", str(output_path), e)
+            return False
+    if args.extraction_config_file:
+        extraction_config_file_path = Path(args.output).expanduser().resolve()
+        if not extraction_config_file_path.exists():
+            logging.error("extraction_config_file: %s not exists, please redefine by --extraction-config-file",
+                          args.extraction_config_file)
+            return False
+    return True
+
+
+def memory_statistics():
+    memory = psutil.virtual_memory()
+
+    # 获取总内存大小（以字节为单位）
+    total_memory = memory.total
+
+    # 格式化内存大小
+    size_units = ["B", "KB", "MB", "GB", "TB"]
+    unit_index = 0
+    while total_memory >= 1024 and unit_index < len(size_units) - 1:
+        total_memory /= 1024
+        unit_index += 1
+    logging.info(f"current memory is : {total_memory:.2f} {size_units[unit_index]}")
+    logging.info(f"final -Xmx is : {max(total_memory - 1, 6):.2f} {size_units[unit_index]}")
+
+
+def conf_option_deal(args):
+    options = dict()
+    if args.extraction_config_file:
+        try:
+            with open(args.extraction_config_file, "r") as f:
+                options = json.load(f)
+        except Exception as e:
+            logging.error(e)
+            return -1
+    for language in args.language:
+        options[language] = dict()
+    if args.extraction_config:
+        # 要求option必须是a.b=c的形式，a为语言名，若不是报错
+        pattern = r'^(.+)\.(.+)\=(.+)$'
+        for tmp in args.extraction_config:
+            match = re.match(pattern, tmp)
+            if match:
+                language = match.group(1)
+                key = match.group(2)
+                value = match.group(3)
+                # 若option与需要抽取的语言对不上, 报错并返回配置错误
+                if language not in args.language:
+                    logging.error("option language error: %s does not need to be extracted", language)
+                    return -1
+                options[language][key] = value
+            else:
+                logging.error("option format error: %s, it need like java.a=b", tmp)
+                return -1
+    return options
+
+
+def database_create(args):
+    if not conf_check(args):
+        logging.error("configuration error, Please check conf")
+        raise ValueError("configuration error")
+    options = conf_option_deal(args)
+    if options == -1:
+        logging.error("configuration error, Please check conf")
+        raise ValueError("configuration error")
+    memory_statistics()
+    timeout = args.timeout
+    extractor_fail = list()
+    for language in args.language:
+        output_path = Path(args.output).expanduser().resolve()
+        src_path = Path(args.source_root).expanduser().resolve()
+        now_output_path = output_path / ("coref_" + language + "_src.db")
+        # 给用户反悔机会,若添加overwrite选项则直接覆盖存量数据
+        if now_output_path.exists():
+            logging.info(f"{now_output_path} will be create")
+            if not args.overwrite:
+                user_input = input(f"file {now_output_path} Already exists, do you want to overwrite it? (y/n): ")
+                if user_input.lower() != "y":
+                    logging.warning(" %s will use old version data", language)
+                    continue
+        start_time = time.time()
+        return_code = extractor_run(language, src_path, output_path, timeout, options[language])
+        if return_code == 0:
+            logging.info("Finish extracting data source %s with %s language Extractor, extraction is Success, "
+                         "execution time is %.2fs.", args.source_root, language, time.time() - start_time)
+        else:
+            extractor_fail.append(language)
+            logging.error("%s extracting error with %s language Extractor, please check by log",
+                          args.source_root, language)
+        timeout = timeout - time.time() + start_time
+        if timeout < 0:
+            logging.error("extract fail: timeout")
+            raise RuntimeError("timeout")
+    if len(extractor_fail) > 0:
+        for language in extractor_fail:
+            logging.error("%s extract fail, please check log", language)
+        logging.error("extract fail")
+        raise RuntimeError("extract fail")
+    else:
+        logging.info("extract success")
+    return
diff --git a/cli/extractor/__init__.py b/cli/extractor/__init__.py
diff --git a/cli/extractor/extractor.py b/cli/extractor/extractor.py
@@ -0,0 +1,192 @@
+import logging
+
+import psutil
+
+from run.runner import Runner
+from sparrow_schema.schema import sparrow
+
+
+class Extractor:
+    # 若添加抽取器，请命名为language_extractor,并完成language_extractor_cmd函数实现执行指令, 可被相应展示与执行
+    cfamily_extractor = ""
+    go_extractor = ""
+    java_extractor = ""
+    javascript_extractor = ""
+    properties_extractor = ""
+    python_extractor = ""
+    sql_extractor = ""
+    swift_extractor = ""
+    xml_extractor = ""
+
+    def __init__(self):
+        Extractor.cfamily_extractor = sparrow.home / "language" / "cfamily" / "extractor" / "usr" / "bin" / "coref-cfamily-src-extractor"
+        Extractor.go_extractor = sparrow.home / "language" / "go" / "extractor" / "coref-go-src-extractor"
+        Extractor.java_extractor = sparrow.home / "language" / "java" / "extractor" / "coref-java-src-extractor_deploy.jar"
+        Extractor.javascript_extractor = sparrow.home / "language" / "javascript" / "extractor" / "coref-javascript-src-extractor"
+        Extractor.properties_extractor = sparrow.home / "language" / "properties" / "extractor" / "coref-properties-src-extractor_deploy.jar"
+        Extractor.python_extractor = sparrow.home / "language" / "python" / "extractor" / "coref-python-src-extractor"
+        Extractor.sql_extractor = sparrow.home / "language" / "sql" / "extractor" / "coref-sql-src-extractor_deploy.jar"
+        Extractor.swift_extractor = sparrow.home / "language" / "swift" / "extractor" / "usr" / "bin" / "coref-swift-src-extractor"
+        Extractor.xml_extractor = sparrow.home / "language" / "xml" / "extractor" / "coref-xml-extractor_deploy.jar"
+
+
+def cfamily_extractor_cmd(source_root, database, options):
+    cmd = list()
+    cmd += [str(Extractor.cfamily_extractor)]
+    cmd += ["--compile-commands=", str(source_root)]
+    cmd += ["--output-db-path=", str(database)]
+    return cmd
+
+
+def go_extractor_cmd(source_root, database, options):
+    cmd = list()
+    cmd += [str(Extractor.go_extractor)]
+    if options:
+        for (key, value) in options.items():
+            if key == "extract-config":
+                for tmp in value.split(","):
+                    cmd += ["-ex", tmp]
+            elif key == "go-build-flag":
+                for tmp in value.split(","):
+                    cmd += [tmp]
+            else:
+                logging.warning("unsupported config name: %s for Go extractor.", key)
+    cmd += ["-o", str(database/"coref_go_src.db")]
+    cmd += [str(source_root)]
+    return cmd
+
+
+def java_extractor_cmd(source_root, database, options):
+    cmd = list()
+    cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database)
+    if options:
+        for (key, value) in options.items():
+            if key == "white-list" or key == "whiteList":
+                cmd += ["-w=", value]
+            elif key == "cp":
+                cmd += ["-cp=", value]
+            elif key == "classpath":
+                cmd += ["--classpath=", value]
+            elif key == "incremental":
+                if value == "true":
+                    cmd += ["--incremental"]
+                    cmd += ["--cache-dir=" + options.get("cache-dir", "null")]
+                    cmd += ["--commit=" + options.get("commit", "null")]
+                    cmd += ["--remote-cache-type=" + options.get("remote-cache-type", "null")]
+                    cmd += ["--oss-bucket=" + options.get("oss-bucket", "null")]
+                    cmd += ["--oss-config-file=" + options.get("oss-config-file", "null")]
+                    cmd += ["--oss-path-prefix=" + options.get("oss-path-prefix", "null")]
+                else:
+                    logging.warning("java.incremental does not take effect, please use java.incremental=true")
+            else:
+                if key != "cache-dir" and key != "commit" and key != "remote-cache-type" and \
+                        key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix":
+                    logging.warning("unsupported config name:%s for java extractor.", key)
+    if "incremental" not in options or options["incremental"] != "true":
+        cmd += ["--parallel"]
+    return cmd
+
+
+def javascript_extractor_cmd(source_root, database, options):
+    cmd = list()
+    cmd += [str(Extractor.javascript_extractor), "extract"] + \
+           ["--src", str(source_root)] + \
+           ["--db", str(database/"coref_javascript_src.db")]
+    if options:
+        for (key, value) in options.items():
+            if key == "black-list" or key == "blacklist":
+                cmd += ["--blacklist"]
+                for tmp in value.split(','):
+                    cmd += [tmp]
+            elif key == "use-gitignore":
+                if value == "true":
+                    cmd += ["--use-gitignore"]
+                else:
+                    logging.warning("javascript.use-gitignore does not take effect, please use "
+                                    "javascript.use-gitignore=true")
+            elif key == "extract-dist":
+                if value == "true":
+                    cmd += ["--extract-dist"]
+                else:
+                    logging.warning("javascript.extract-dist does not take effect, please use "
+                                    "javascript.extract-dist=true")
+            elif key == "extract-deps":
+                if value == "true":
+                    cmd += ["--extract-deps"]
+                else:
+                    logging.warning("javascript.extract-deps does not take effect, please use "
+                                    "javascript.extract-deps=true")
+            elif key == "file-size-limit":
+                cmd += ["--file-size-limit", value]
+            else:
+                logging.warning("unsupported config name:%s for javascript extractor.", key)
+    return cmd
+
+
+def properties_extractor_cmd(source_root, database, options):
+    cmd = jar_extractor_cmd(Extractor.properties_extractor, source_root, database)
+    return cmd
+
+
+def python_extractor_cmd(source_root, database, options):
+    cmd = list()
+    cmd += [str(Extractor.python_extractor), "-s", str(source_root), "-d", str(database)]
+    return cmd
+
+
+def sql_extractor_cmd(source_root, database, options):
+    cmd = list()
+    cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database)
+    if "sql-dialect-type" in options:
+        cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
+    return cmd
+
+
+def swift_extractor(source_root, database, options):
+    cmd = list()
+    cmd += [str(Extractor.swift_extractor), str(source_root), str(database)]
+    if options:
+        for (key, value) in options.items():
+            if key == "corpus":
+                for tmp in value.split(","):
+                    cmd += ["--corpus", tmp]
+            else:
+                logging.warning("unsupported config name:%s for Swift extractor.", key)
+    return cmd
+
+
+def xml_extractor_cmd(source_root, database, options):
+    cmd = jar_extractor_cmd(Extractor.xml_extractor, source_root, database)
+    return cmd
+
+
+def jar_extractor_cmd(extractor_path, source_root, database):
+    # 获取内存信息
+    mem = psutil.virtual_memory()
+    total_memory = mem.total
+    total_memory_gb = round(total_memory / (1024 ** 3))
+    logging.info("current memory is : %s GB", total_memory_gb)
+    xmx = max(total_memory_gb - 1, 6)
+    logging.info("final -Xmx is: %s GB", xmx)
+    cmd = list()
+    cmd += ["java", "-jar", "-Xmx" + str(xmx) + "g", str(extractor_path)]
+    cmd += [str(source_root), str(database)]
+    return cmd
+
+
+def extractor_run(language, source_root, database, timeout, options):
+    function_name = language + "_extractor_cmd"
+    if function_name in globals():
+        # 通过语言名确定函数名并直接调用对应抽取器执行函数，获取执行指令
+        function = globals()[function_name]
+        cmd = function(source_root, database, options)
+        if cmd == -1:
+            logging.error("option error")
+            logging.error("Failed to obtain the %s extractor", language)
+            return -1
+        tmp = Runner(cmd, timeout)
+        return tmp.subrun()
+    else:
+        logging.error("Not supported language: %s", language)
+        return -1
+
diff --git a/cli/godel/__init__.py b/cli/godel/__init__.py