-
Notifications
You must be signed in to change notification settings - Fork 159
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from elliotchance/ast2json
Ast2json
- Loading branch information
Showing
3 changed files
with
343 additions
and
186 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ | |
/pp.c | ||
/out.go | ||
/a.out | ||
/pp.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
import sys | ||
import re | ||
import json | ||
|
||
# This script converts the output of clang AST into a JSON file.
#
# Usage:
#   clang -Xclang -ast-dump -fsyntax-only myfile.c | python ast2json.py
#
# Yes, there are many better ways to do this. However I chose this method
# because:
#
# 1. I need to separate the clang AST from the c2go conversion process so that
#    the c2go program can ingest a reliable JSON file and not depend on clang or
#    its different versions at all.
# 2. The clang API is not stable and trying to match up binaries with different
#    versions and operating systems can be tricky and brittle.
# 3. This tool, in time, will become a better binary of some kind that produces
#    much the same JSON output (so minimal changes to c2go.py).
# 4. I needed something quick and dirty to prove the complete toolchain and get
#    it working on different versions of clang and different operating systems
#    before we have enough information to really standardise the process.
|
||
# Maps a clang AST node type name to the regular expression used to parse the
# remainder of that node's line (everything after the tree-drawing prefix and
# the node type name). The named groups of each pattern become the keys of the
# node's JSON object.
regex = {
    'AlwaysInlineAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> always_inline",
    'ArraySubscriptExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<tags>.*)",
    'AsmLabelAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \"(?P<function>.+)\"",
    'AvailabilityAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<os>\w+) (?P<version>[\d.]+) (?P<unknown1>[\d.]+) (?P<unknown2>[\d.]+) (?P<unknown3>\".*?\"|\w+) (?P<unknown4>\".*?\"|\w+)",
    'BinaryOperator': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' '(?P<operator>.*?)'",
    'BreakStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
    'BuiltinType': r'^ (?P<address>[0-9a-fx]+) \'(?P<name>.*)\'',
    'CallExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*?)\'',
    'CharacterLiteral': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<value>\d+)",
    'CompoundStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
    'ConstantArrayType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<size>\d+)',
    'CStyleCastExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' <(?P<kind>.*)>",
    'DeclRefExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)'.*? (lvalue (?P<kind>\w+)|Function) (?P<address2>[0-9a-fx]+) '(?P<name>.*?)' '(?P<type2>.*?)'",
    'DeclStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
    'DeprecatedAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \"(?P<message1>.*?)\" \"(?P<message2>.*?)\"",
    'ElaboratedType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<tags>.+)',
    'FieldDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<tags>.*?)(?P<name>\w+?) '(?P<type>.+?)'",
    'FloatingLiteral': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*)' (?P<value>.+)",
    'FormatAttr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>(?P<tags> Implicit)? (?P<function>\w+) (?P<unknown1>\d+) (?P<unknown2>\d+)',
    'ForStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
    'FunctionDecl': r"^ (?P<address>[0-9a-fx]+) (?P<prev>prev [0-9a-fx]+)? ?<(?P<position1>.*)> (?P<position2>[^ ]+)(?P<tags1> implicit)?(?P<tags2> used)? (?P<name>\w+) '(?P<type>.*)'(?P<tags3> extern)?",
    'IfStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
    'ImplicitCastExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\' <(?P<kind>.*)>',
    'IntegerLiteral': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\' (?P<value>.+)',
    'MemberExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<tags>.*?)(?P<name>\w+) (?P<address2>[0-9a-fx]+)",
    'ParenExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*?)\'',
    'ParmVarDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>.+?)(?P<name> \w+)? '(?P<type>.*?)'(?P<type2>:'.*?')?",
    'PointerType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
    'Record': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
    'RecordDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<kind>struct|union) (?P<name>\w+)",
    'RecordType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
    'ReturnStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
    'StringLiteral': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\'(?P<tags> lvalue)? (?P<value>.*)',
    'TranslationUnitDecl': r'^ (?P<address>[0-9a-fx]+)',
    'Typedef': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
    'TypedefDecl': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.+?)> (?P<position2><invalid sloc>|[^ ]+)(?P<tags>.*?) (?P<name>\w+) \'(?P<type>.*?)\'(?P<type2>:\'.*?\')?',
    'TypedefType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<tags>.+)',
    'UnaryOperator': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)'(?P<tags1> lvalue)?(?P<tags2> prefix)?(?P<tags3> postfix)? '(?P<operator>.*?)'",
    'VarDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<name>.+) '(?P<type>.+?)'.*?(?P<tags>.*)",
    'WhileStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
}
|
||
def build_tree(nodes, depth):
    """Convert a flat list of depth-prefixed nodes into a nested tree.

    Arguments:
    nodes -- list of [depth, node] pairs in document order, where node is a
             dict describing one AST node.
    depth -- the depth of the root nodes at this level of the tree.

    Returns a list of the node dicts found at `depth`. Every node that has
    deeper nodes following it gains a 'children' key holding the list built
    from those nodes. The node dicts are the same objects passed in; they are
    modified in place.

    Raises ValueError if the first node is not at `depth`, since it would have
    no parent to be attached to.
    """
    if not nodes:
        return []

    # Split the list into sections, treat each section as a tree with its own
    # root.
    sections = []
    for node in nodes:
        if node[0] == depth:
            sections.append([node])
        elif sections:
            sections[-1].append(node)
        else:
            raise ValueError(
                "First node has depth %s but a root at depth %s was expected"
                % (node[0], depth))

    results = []
    for section in sections:
        # The first entry of a section is its root; anything deeper belongs
        # somewhere underneath it.
        result = section[0][1]
        children = build_tree([n for n in section if n[0] > depth], depth + 1)

        if children:
            result['children'] = children

        results.append(result)

    return results
|
||
def read_ast():
    """Read the clang AST dump from stdin and return it as a list of lines.

    ANSI colour escape sequences (ESC '[' ... 'm') are removed first, so the
    dump can be parsed whether or not clang coloured its output. The input is
    split on "\n", so text ending in a newline yields a trailing empty string.
    """
    stdin = sys.stdin.read()
    # '*' rather than '+' so the parameterless reset sequence ESC[m is
    # stripped as well.
    uncolored = re.sub(r'\x1b\[[\d;]*m', '', stdin)
    return uncolored.split("\n")
|
||
def convert_lines_to_nodes(lines):
    """Parse the lines of a clang AST dump into a flat list of nodes.

    Arguments:
    lines -- list of strings, one per line of the (uncoloured) AST dump.

    Returns a list of [indent_level, node] pairs in input order. indent_level
    is the length of the tree-drawing prefix divided by two. node is a dict of
    the named groups captured by the regex registered for the node type, plus
    a 'node' key holding the node type name.

    Blank lines and lines containing '<<<NULL>>>' are skipped.

    If a line can not be parsed, or there is no regex for its node type, a
    diagnostic is written to stderr and the process exits with status 1.
    """
    # Tree-drawing characters (group 1) followed by the node type (group 2).
    prefix_pattern = r'^([|\- `]*)(\w+)'

    nodes = []
    for line in lines:
        if line.strip() == '':
            continue

        # This will need to be handled more gracefully... I'm not even sure
        # what this means?
        if '<<<NULL>>>' in line:
            continue

        indent_and_type = re.search(prefix_pattern, line)
        if indent_and_type is None:
            sys.stderr.write("Can not understand line '%s'\n" % line)
            sys.exit(1)

        node_type = indent_and_type.group(2)

        try:
            pattern = regex[node_type]
        except KeyError:
            sys.stderr.write("There is no regex for '%s'.\n" % node_type)
            sys.stderr.write("I will print out all the lines so a regex can "
                             "be created:\n\n")

            for candidate in lines:
                s = re.search(prefix_pattern, candidate)
                if s is not None and node_type == s.group(2):
                    # Strip each line's own prefix; lines of the same type
                    # can sit at different depths.
                    sys.stderr.write(candidate[len(s.group(0)):] + "\n")

            sys.exit(1)

        offset = len(indent_and_type.group(0))
        result = re.search(pattern, line[offset:])
        if result is None:
            sys.stderr.write("Can not understand line '%s'\n" % line)
            sys.exit(1)

        node = result.groupdict()
        node['node'] = node_type

        # Floor division keeps the level an integer on Python 3 as well.
        indent_level = len(indent_and_type.group(1)) // 2
        nodes.append([indent_level, node])

    return nodes
|
||
# Only run the conversion when executed as a script, so that importing this
# module does not block reading stdin.
if __name__ == '__main__':
    lines = read_ast()
    nodes = convert_lines_to_nodes(lines)
    tree = build_tree(nodes, 0)

    print(json.dumps(tree, sort_keys=True, indent=2, separators=(',', ': ')))
Oops, something went wrong.