Skip to content

Commit

Permalink
Merge pull request #1 from codefuse-ai/init
Browse files Browse the repository at this point in the history
Initial PR from Ant Program Analysis Team
  • Loading branch information
zhouang777 authored Nov 27, 2023
2 parents b75f0e2 + b75a5a7 commit b3ddb88
Show file tree
Hide file tree
Showing 245 changed files with 58,891 additions and 0 deletions.
36 changes: 36 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
logs/
*.log
.DS_Store

### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache

### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr

### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/

### VS Code ###
.vscode/
.cloudide
7 changes: 7 additions & 0 deletions LEGAL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Legal Disclaimer

Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.

法律免责声明

关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。
Empty file added README.md
Empty file.
Empty file added cli/database/__init__.py
Empty file.
122 changes: 122 additions & 0 deletions cli/database/create.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import json
import re
import time
from pathlib import Path

from extractor.extractor import *


def conf_check(args):
    """Validate the paths supplied on the command line before extraction.

    Checks that ``args.source_root`` exists, creates ``args.output``
    (including parents) when it is missing, and, when
    ``args.extraction_config_file`` is set, checks that the file exists.

    Args:
        args: parsed CLI namespace providing ``source_root``, ``output`` and
            ``extraction_config_file``.

    Returns:
        True when every check passes, False otherwise (the reason is logged).
    """
    src_path = Path(args.source_root).expanduser().resolve()
    if not src_path.exists():
        logging.error("source not exist, Please redefine the directory or file by --source-root or -s")
        return False

    output_path = Path(args.output).expanduser().resolve()
    if not output_path.exists():
        logging.warning("%s not exists, it will be created", str(output_path))
        try:
            output_path.mkdir(parents=True)
            logging.info("%s success build", str(output_path))
        except OSError as e:
            logging.error("can not to create database directory %s: %s", str(output_path), e)
            return False

    if args.extraction_config_file:
        # Resolve the config file itself. Resolving args.output here would
        # always succeed (the directory was just validated/created above) and
        # a missing config file would go unnoticed.
        extraction_config_file_path = Path(args.extraction_config_file).expanduser().resolve()
        if not extraction_config_file_path.exists():
            logging.error("extraction_config_file: %s not exists, please redefine by --extraction-config-file",
                          args.extraction_config_file)
            return False
    return True


def memory_statistics():
    """Log the total physical memory and the JVM heap limit derived from it.

    The total is logged in the largest unit (B..TB) that keeps the number
    below 1024. The heap limit is always computed and logged in GB as
    ``max(round(total_gb) - 1, 6)``, which is the same expression
    ``jar_extractor_cmd`` uses to build its ``-Xmx`` flag.
    """
    total_bytes = psutil.virtual_memory().total

    # Scale the byte count into a human-readable unit.
    size_units = ["B", "KB", "MB", "GB", "TB"]
    readable = float(total_bytes)
    unit_index = 0
    while readable >= 1024 and unit_index < len(size_units) - 1:
        readable /= 1024
        unit_index += 1
    logging.info(f"current memory is : {readable:.2f} {size_units[unit_index]}")

    # Compute the heap limit from the raw byte count, not from the scaled
    # number above: subtracting 1 and flooring at 6 only makes sense in GB.
    xmx_gb = max(round(total_bytes / (1024 ** 3)) - 1, 6)
    logging.info(f"final -Xmx is : {xmx_gb} GB")


def conf_option_deal(args):
    """Build the per-language extractor options.

    Options come from two sources, applied in this order:

    1. ``args.extraction_config_file`` - a JSON file whose root is an object
       mapping a language name to an object of options.
    2. ``args.extraction_config`` - ``language.key=value`` strings from the
       command line, which override same-named keys loaded from the file.

    Every language in ``args.language`` is guaranteed an entry in the result.

    Returns:
        dict mapping language -> dict of options, or ``-1`` when the file
        cannot be read/parsed or a command-line option is malformed or names
        a language that is not being extracted.
    """
    options = dict()
    if args.extraction_config_file:
        try:
            with open(args.extraction_config_file, "r") as f:
                options = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and decoding failures.
            logging.error(e)
            return -1
        if not isinstance(options, dict):
            logging.error("extraction config file %s must contain a JSON object", args.extraction_config_file)
            return -1

    for language in args.language:
        # Keep what the file provided; only create the entry when it is
        # missing (or is not an object and therefore cannot hold options).
        if not isinstance(options.get(language), dict):
            options[language] = dict()

    if args.extraction_config:
        # Each option must look like a.b=c where a is the language name.
        # The language stops at the first '.', and the key at the first '=',
        # so values may themselves contain '.' and '='.
        pattern = re.compile(r'^([^.]+)\.([^=]+)=(.+)$')
        for tmp in args.extraction_config:
            match = pattern.match(tmp)
            if not match:
                logging.error("option format error: %s, it need like java.a=b", tmp)
                return -1
            language, key, value = match.groups()
            # An option for a language that is not being extracted is a
            # configuration error.
            if language not in args.language:
                logging.error("option language error: %s does not need to be extracted", language)
                return -1
            options[language][key] = value
    return options


def database_create(args):
    """Run the extractor of every requested language and report the outcome.

    Validates the configuration, resolves the per-language options, logs the
    memory statistics, then extracts each language in ``args.language`` in
    turn. ``args.timeout`` is a shared budget: the time spent on each
    extraction is subtracted from it before the next one starts.

    Raises:
        ValueError: the configuration or the options are invalid.
        RuntimeError: the time budget is exhausted, or at least one
            extractor returned a non-zero code.
    """
    if not conf_check(args):
        logging.error("configuration error, Please check conf")
        raise ValueError("configuration error")

    options = conf_option_deal(args)
    if options == -1:
        logging.error("configuration error, Please check conf")
        raise ValueError("configuration error")

    memory_statistics()

    remaining = args.timeout
    failed_languages = []
    for language in args.language:
        database_dir = Path(args.output).expanduser().resolve()
        source_dir = Path(args.source_root).expanduser().resolve()
        db_file = database_dir / ("coref_" + language + "_src.db")

        # Give the user a chance to back out; with the overwrite option the
        # existing data is replaced without asking.
        if db_file.exists():
            logging.info(f"{db_file} will be create")
            if not args.overwrite:
                answer = input(f"file {db_file} Already exists, do you want to overwrite it? (y/n): ")
                if answer.lower() != "y":
                    logging.warning(" %s will use old version data", language)
                    continue

        started = time.time()
        if extractor_run(language, source_dir, database_dir, remaining, options[language]) == 0:
            logging.info("Finish extracting data source %s with %s language Extractor, extraction is Success, "
                         "execution time is %.2fs.", args.source_root, language, time.time() - started)
        else:
            failed_languages.append(language)
            logging.error("%s extracting error with %s language Extractor, please check by log",
                          args.source_root, language)

        # Charge the elapsed time of this extraction against the budget.
        remaining = remaining - time.time() + started
        if remaining < 0:
            logging.error("extract fail: timeout")
            raise RuntimeError("timeout")

    if not failed_languages:
        logging.info("extract success")
        return

    for language in failed_languages:
        logging.error("%s extract fail, please check log", language)
    logging.error("extract fail")
    raise RuntimeError("extract fail")
Empty file added cli/extractor/__init__.py
Empty file.
192 changes: 192 additions & 0 deletions cli/extractor/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import logging

import psutil

from run.runner import Runner
from sparrow_schema.schema import sparrow


class Extractor:
    """Holder for the location of every language extractor.

    The locations are class attributes. They are empty strings until an
    ``Extractor`` is instantiated, at which point ``__init__`` fills them in
    relative to ``sparrow.home``.
    """

    # When adding an extractor, name the attribute <language>_extractor and
    # implement a matching <language>_extractor_cmd function that builds its
    # command line; it can then be listed and executed accordingly.
    cfamily_extractor = ""
    go_extractor = ""
    java_extractor = ""
    javascript_extractor = ""
    properties_extractor = ""
    python_extractor = ""
    sql_extractor = ""
    swift_extractor = ""
    xml_extractor = ""

    def __init__(self):
        """Assign every extractor location on the class (not the instance)."""
        # Path components below "<sparrow.home>/language" for each attribute.
        relative_locations = {
            "cfamily_extractor": ("cfamily", "extractor", "usr", "bin", "coref-cfamily-src-extractor"),
            "go_extractor": ("go", "extractor", "coref-go-src-extractor"),
            "java_extractor": ("java", "extractor", "coref-java-src-extractor_deploy.jar"),
            "javascript_extractor": ("javascript", "extractor", "coref-javascript-src-extractor"),
            "properties_extractor": ("properties", "extractor", "coref-properties-src-extractor_deploy.jar"),
            "python_extractor": ("python", "extractor", "coref-python-src-extractor"),
            "sql_extractor": ("sql", "extractor", "coref-sql-src-extractor_deploy.jar"),
            "swift_extractor": ("swift", "extractor", "usr", "bin", "coref-swift-src-extractor"),
            "xml_extractor": ("xml", "extractor", "coref-xml-extractor_deploy.jar"),
        }
        for attribute, parts in relative_locations.items():
            # Join with the "/" operator one component at a time, starting
            # from sparrow.home (presumably a pathlib-style path - confirm).
            location = sparrow.home / "language"
            for part in parts:
                location = location / part
            setattr(Extractor, attribute, location)


def cfamily_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the C-family extractor.

    ``options`` is accepted so that all ``*_extractor_cmd`` builders share
    one signature; it is not used here.
    """
    # NOTE(review): each flag ("--compile-commands=") and its value are
    # separate list items, not one "flag=value" item - confirm the extractor
    # and the Runner expect them split like this.
    return [
        str(Extractor.cfamily_extractor),
        "--compile-commands=", str(source_root),
        "--output-db-path=", str(database),
    ]


def go_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the Go extractor.

    Recognised options (values are comma-separated lists):
        extract-config: every item is passed as ``-ex <item>``.
        go-build-flag: every item is passed through verbatim.
    Any other key is reported with a warning and ignored.
    """
    argv = [str(Extractor.go_extractor)]
    if options:
        for name, raw in options.items():
            if name == "extract-config":
                for item in raw.split(","):
                    argv.extend(["-ex", item])
            elif name == "go-build-flag":
                argv.extend(raw.split(","))
            else:
                logging.warning("unsupported config name: %s for Go extractor.", name)
    argv.extend(["-o", str(database/"coref_go_src.db")])
    argv.append(str(source_root))
    return argv


def java_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the Java extractor.

    Recognised options:
        white-list / whiteList: passed after ``-w=``.
        cp: passed after ``-cp=``.
        classpath: passed after ``--classpath=``.
        incremental: only the literal value ``"true"`` enables incremental
            mode. In that mode ``cache-dir``, ``commit``,
            ``remote-cache-type``, ``oss-bucket``, ``oss-config-file`` and
            ``oss-path-prefix`` are forwarded as ``--<name>=<value>``, with
            the literal string ``"null"`` for any that is absent.
    Any other key is reported with a warning and ignored. ``--parallel`` is
    appended whenever incremental mode is not enabled.

    Args:
        options: dict of option name -> string value; ``None`` or an empty
            dict means "no options".
    """
    # Keys that are consumed only as part of incremental mode; on their own
    # they are neither forwarded nor reported as unsupported.
    incremental_keys = ("cache-dir", "commit", "remote-cache-type",
                        "oss-bucket", "oss-config-file", "oss-path-prefix")
    # Normalise once so that both the loop and the --parallel decision below
    # tolerate options=None.
    options = options or {}

    cmd = list()
    cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database)
    for key, value in options.items():
        # NOTE(review): "-w=", "-cp=" and "--classpath=" are emitted as a
        # separate list item from their value, unlike the "--name=value"
        # items of incremental mode - confirm this is what the extractor
        # expects.
        if key in ("white-list", "whiteList"):
            cmd += ["-w=", value]
        elif key == "cp":
            cmd += ["-cp=", value]
        elif key == "classpath":
            cmd += ["--classpath=", value]
        elif key == "incremental":
            if value == "true":
                cmd += ["--incremental"]
                cmd += ["--" + name + "=" + options.get(name, "null") for name in incremental_keys]
            else:
                logging.warning("java.incremental does not take effect, please use java.incremental=true")
        elif key not in incremental_keys:
            logging.warning("unsupported config name:%s for java extractor.", key)

    if options.get("incremental") != "true":
        cmd += ["--parallel"]
    return cmd


def javascript_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the JavaScript extractor.

    Recognised options:
        black-list / blacklist: comma-separated items passed after
            ``--blacklist``.
        use-gitignore, extract-dist, extract-deps: switches, emitted as
            ``--<name>`` only when the value is the literal ``"true"``.
        file-size-limit: passed after ``--file-size-limit``.
    Any other key is reported with a warning and ignored.
    """
    argv = [
        str(Extractor.javascript_extractor), "extract",
        "--src", str(source_root),
        "--db", str(database/"coref_javascript_src.db"),
    ]
    if not options:
        return argv

    # Switch name -> warning logged when its value is anything but "true".
    switch_warnings = {
        "use-gitignore": "javascript.use-gitignore does not take effect, please use "
                         "javascript.use-gitignore=true",
        "extract-dist": "javascript.extract-dist does not take effect, please use "
                        "javascript.extract-dist=true",
        "extract-deps": "javascript.extract-deps does not take effect, please use "
                        "javascript.extract-deps=true",
    }
    for name, raw in options.items():
        if name in ("black-list", "blacklist"):
            argv.append("--blacklist")
            argv.extend(raw.split(','))
        elif name in switch_warnings:
            if raw == "true":
                argv.append("--" + name)
            else:
                logging.warning(switch_warnings[name])
        elif name == "file-size-limit":
            argv.extend(["--file-size-limit", raw])
        else:
            logging.warning("unsupported config name:%s for javascript extractor.", name)
    return argv


def properties_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the properties extractor jar.

    ``options`` is accepted for signature parity with the other builders and
    is not used.
    """
    return jar_extractor_cmd(Extractor.properties_extractor, source_root, database)


def python_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the Python extractor.

    The source root is passed after ``-s`` and the database location after
    ``-d``. ``options`` is accepted for signature parity and is not used.
    """
    executable = str(Extractor.python_extractor)
    return [executable, "-s", str(source_root), "-d", str(database)]


def sql_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the SQL extractor jar.

    Recognised options:
        sql-dialect-type: passed after ``--sql-dialect-type``.

    Args:
        options: dict of option name -> value; ``None`` or an empty dict
            means "no options".
    """
    cmd = list()
    cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database)
    # Guard against options=None the same way the sibling builders do with
    # "if options:"; a bare membership test would raise TypeError.
    if options and "sql-dialect-type" in options:
        cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
    return cmd


def swift_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the Swift extractor.

    The function must be named ``<language>_extractor_cmd``:
    ``extractor_run`` looks builders up by that name, so under any other
    name Swift is reported as an unsupported language.

    Recognised options:
        corpus: comma-separated items, each passed as ``--corpus <item>``.
    Any other key is reported with a warning and ignored.
    """
    cmd = [str(Extractor.swift_extractor), str(source_root), str(database)]
    if options:
        for key, value in options.items():
            if key == "corpus":
                for item in value.split(","):
                    cmd += ["--corpus", item]
            else:
                logging.warning("unsupported config name:%s for Swift extractor.", key)
    return cmd


# Backward-compatible alias for the name this builder was originally
# defined under.
swift_extractor = swift_extractor_cmd


def xml_extractor_cmd(source_root, database, options):
    """Return the argument list that launches the XML extractor jar.

    ``options`` is accepted for signature parity with the other builders and
    is not used.
    """
    return jar_extractor_cmd(Extractor.xml_extractor, source_root, database)


def jar_extractor_cmd(extractor_path, source_root, database):
    """Return the ``java -jar`` argument list for a jar-packaged extractor.

    The JVM heap limit is the machine's total memory rounded to whole GB,
    minus one, but never below 6 GB. Both figures are logged.
    """
    # Read the total physical memory and express it in whole GB.
    total_memory_gb = round(psutil.virtual_memory().total / (1024 ** 3))
    logging.info("current memory is : %s GB", total_memory_gb)

    heap_gb = max(total_memory_gb - 1, 6)
    logging.info("final -Xmx is: %s GB", heap_gb)

    return [
        "java", "-jar", "-Xmx" + str(heap_gb) + "g", str(extractor_path),
        str(source_root), str(database),
    ]


def extractor_run(language, source_root, database, timeout, options):
    """Build and run the extractor command for ``language``.

    The command builder is the module-level function named
    ``<language>_extractor_cmd``.

    Returns:
        ``-1`` when no such builder exists or the builder returns ``-1``;
        otherwise whatever ``Runner(cmd, timeout).subrun()`` returns.
    """
    builder_name = language + "_extractor_cmd"
    module_scope = globals()
    if builder_name not in module_scope:
        logging.error("Not supported language: %s", language)
        return -1

    # The language name selects the builder, which is called directly to
    # obtain the command to execute.
    cmd = module_scope[builder_name](source_root, database, options)
    if cmd == -1:
        logging.error("option error")
        logging.error("Failed to obtain the %s extractor", language)
        return -1

    return Runner(cmd, timeout).subrun()

Empty file added cli/godel/__init__.py
Empty file.
Loading

0 comments on commit b3ddb88

Please sign in to comment.