Skip to content

Commit

Permalink
Merge pull request #5 from elliotchance/ast2json
Browse files Browse the repository at this point in the history
Ast2json
  • Loading branch information
elliotchance authored Mar 25, 2017
2 parents 472f2ad + d77e673 commit 7539b75
Show file tree
Hide file tree
Showing 3 changed files with 343 additions and 186 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
/pp.c
/out.go
/a.out
/pp.json
149 changes: 149 additions & 0 deletions ast2json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import sys
import re
import json

# This script converts the output of clang AST into a JSON file.
#
# Usage:
# clang -Xclang -ast-dump -fsyntax-only myfile.c | python ast2json.py
#
# Yes, there are many better ways to do this. However I chose this method
# because:
#
# 1. I need to separate the clang AST from the c2go conversion process so that
# the c2go program can ingest a reliable JSON file and not depend on clang or
# its different versions at all.
# 2. The clang API is not stable and trying to match up binaries with different
# versions and operating systems can be tricky and brittle.
# 3. This tool, in time, will become a better binary of some kind that produces
# much the same JSON output (so minimal changes to c2go.py).
# 4. I needed something quick and dirty to proof the complete toolchain and get
# it working on different versions of clang and different operating systems
# before we enough information to really standardise the process.

regex = {
'AlwaysInlineAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> always_inline",
'ArraySubscriptExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<tags>.*)",
'AsmLabelAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \"(?P<function>.+)\"",
'AvailabilityAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<os>\w+) (?P<version>[\d.]+) (?P<unknown1>[\d.]+) (?P<unknown2>[\d.]+) (?P<unknown3>\".*?\"|\w+) (?P<unknown4>\".*?\"|\w+)",
'BinaryOperator': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' '(?P<operator>.*?)'",
'BreakStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
'BuiltinType': r'^ (?P<address>[0-9a-fx]+) \'(?P<name>.*)\'',
'CallExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*?)\'',
'CharacterLiteral': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<value>\d+)",
'CompoundStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
'ConstantArrayType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<size>\d+)',
'CStyleCastExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' <(?P<kind>.*)>",
'DeclRefExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)'.*? (lvalue (?P<kind>\w+)|Function) (?P<address2>[0-9a-fx]+) '(?P<name>.*?)' '(?P<type2>.*?)'",
'DeclStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
'DeprecatedAttr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \"(?P<message1>.*?)\" \"(?P<message2>.*?)\"",
'ElaboratedType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<tags>.+)',
'FieldDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<tags>.*?)(?P<name>\w+?) '(?P<type>.+?)'",
'FloatingLiteral': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*)' (?P<value>.+)",
'FormatAttr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>(?P<tags> Implicit)? (?P<function>\w+) (?P<unknown1>\d+) (?P<unknown2>\d+)',
'ForStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
'FunctionDecl': r"^ (?P<address>[0-9a-fx]+) (?P<prev>prev [0-9a-fx]+)? ?<(?P<position1>.*)> (?P<position2>[^ ]+)(?P<tags1> implicit)?(?P<tags2> used)? (?P<name>\w+) '(?P<type>.*)'(?P<tags3> extern)?",
'IfStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
'ImplicitCastExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\' <(?P<kind>.*)>',
'IntegerLiteral': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\' (?P<value>.+)',
'MemberExpr': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)' (?P<tags>.*?)(?P<name>\w+) (?P<address2>[0-9a-fx]+)",
'ParenExpr': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*?)\'',
'ParmVarDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>.+?)(?P<name> \w+)? '(?P<type>.*?)'(?P<type2>:'.*?')?",
'PointerType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'Record': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'RecordDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<kind>struct|union) (?P<name>\w+)",
'RecordType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'ReturnStmt': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>',
'StringLiteral': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> \'(?P<type>.*)\'(?P<tags> lvalue)? (?P<value>.*)',
'TranslationUnitDecl': r'^ (?P<address>[0-9a-fx]+)',
'Typedef': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\'',
'TypedefDecl': r'^ (?P<address>[0-9a-fx]+) <(?P<position>.+?)> (?P<position2><invalid sloc>|[^ ]+)(?P<tags>.*?) (?P<name>\w+) \'(?P<type>.*?)\'(?P<type2>:\'.*?\')?',
'TypedefType': r'^ (?P<address>[0-9a-fx]+) \'(?P<type>.*)\' (?P<tags>.+)',
'UnaryOperator': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> '(?P<type>.*?)'(?P<tags1> lvalue)?(?P<tags2> prefix)?(?P<tags3> postfix)? '(?P<operator>.*?)'",
'VarDecl': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)> (?P<position2>[^ ]+) (?P<name>.+) '(?P<type>.+?)'.*?(?P<tags>.*)",
'WhileStmt': r"^ (?P<address>[0-9a-fx]+) <(?P<position>.*)>",
}

def build_tree(nodes, depth):
"""Convert an array of nodes, each prefixed with a depth into a tree."""
if len(nodes) == 0:
return []

# Split the list into sections, treat each section as a a tree with its own
# root.
sections = []
for node in nodes:
if node[0] == depth:
sections.append([node])
else:
sections[-1].append(node)

results = []
for section in sections:
children = build_tree([n for n in section if n[0] > depth], depth + 1)
result = section[0][1]

if len(children) > 0:
result['children'] = children

results.append(result)

return results

def read_ast():
stdin = sys.stdin.read()
uncolored = re.sub(r'\x1b\[[\d;]+m', '', stdin)
return uncolored.split("\n")

def convert_lines_to_nodes(lines):
nodes = []
for line in lines:
if line.strip() == '':
continue

# This will need to be handled more gracefully... I'm not even sure
# what this means?
if '<<<NULL>>>' in line:
continue

indent_and_type = re.search(r'^([|\- `]*)(\w+)', line)
if indent_and_type is None:
print("Can not understand line '%s'" % line)
sys.exit(1)

node_type = indent_and_type.group(2)
# if node_type == 'FieldDecl':
# print(line[offset:])

offset = len(indent_and_type.group(0))
try:
result = re.search(regex[node_type], line[offset:])
except KeyError:
print("There is no regex for '%s'." % node_type)
print("I will print out all the lines so a regex can be created:\n")

for line in lines:
s = re.search(r'^([|\- `]*)(\w+)', line)
if s is not None and node_type == s.group(2):
print(line[offset:])

sys.exit(1)

if result is None:
print("Can not understand line '%s'" % line)
sys.exit(1)

node = result.groupdict()

node['node'] = node_type

indent_level = len(indent_and_type.group(1)) / 2
nodes.append([indent_level, node])

return nodes

lines = read_ast()
nodes = convert_lines_to_nodes(lines)
tree = build_tree(nodes, 0)

print(json.dumps(tree, sort_keys=True, indent=2, separators=(',', ': ')))
Loading

0 comments on commit 7539b75

Please sign in to comment.