WIP - POC with Chevrotain Parser. #142
lib/dsl/chev_grammar.js (new file)
@@ -0,0 +1,265 @@
const chevrotain = require('chevrotain');
const _ = require('lodash');

const Lexer = chevrotain.Lexer;
const Parser = chevrotain.Parser;

// ----------------- lexer -----------------
const tokens = {};

function createToken(config) {
  tokens[config.name] = chevrotain.createToken(config);
}

createToken({
  name: 'WhiteSpace',
  // TODO: uncertain why the original grammar disallowed newlines
  // TODO: in the "fieldDeclList" and "validationList" rules
  pattern: /\s+/,
  // HIGHLIGHT:
  // This special "group" causes the lexer to completely ignore
  // these tokens. In practical terms this means the parser
  // does not need hundreds of "SPACE*" rules everywhere (unlike pegjs).
  // With a single line we can make our language whitespace insensitive.
  group: Lexer.SKIPPED,
  line_breaks: true
});
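
// Illustration (not part of the original diff): with the SKIPPED group,
// inputs that differ only in whitespace produce identical token streams.
// Using the JDLLexer instance defined further down:
//
//   JDLLexer.tokenize('entity Foo').tokens.map(tok => tok.image);
//   // -> ['entity', 'Foo']
//   JDLLexer.tokenize('entity\n\n    Foo').tokens.map(tok => tok.image);
//   // -> ['entity', 'Foo'], no whitespace tokens appear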

// Comments
createToken({
  name: 'Comment',
  pattern: /\/\*[^]*?\*\//,
  // HIGHLIGHT:
  // By using the "group" option, the comments
  // will be collected into a separate array property.
  // This means comments can (unlike pegjs):
  // 1. Appear anywhere.
  // 2. Be completely ignored when implementing the grammar.
  group: 'comments',
  line_breaks: true
});
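
// Illustration (not part of the original diff): a comment anywhere in the
// input lands in the "comments" group rather than in the main token vector.
// Using the JDLLexer instance defined further down:
//
//   const lexed = JDLLexer.tokenize('entity /* why? */ Foo');
//   lexed.tokens.map(tok => tok.image); // -> ['entity', 'Foo']
//   lexed.groups.comments[0].image;     // -> '/* why? */'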

// Constants
// Application constants
createToken({ name: 'APPLICATION', pattern: 'application' });
createToken({ name: 'BASE_NAME', pattern: 'baseName' });
createToken({ name: 'PATH', pattern: 'path' });
createToken({ name: 'PACKAGE_NAME', pattern: 'packageName' });
createToken({ name: 'AUTHENTICATION_TYPE', pattern: 'authenticationType' });
createToken({ name: 'HIBERNATE_CACHE', pattern: 'hibernateCache' });
createToken({ name: 'CLUSTERED_HTTP_SESSION', pattern: 'clusteredHttpSession' });
createToken({ name: 'WEBSOCKET', pattern: 'websocket' });
createToken({ name: 'DATABASE_TYPE', pattern: 'databaseType' });
createToken({ name: 'DEV_DATABASE_TYPE', pattern: 'devDatabaseType' });
createToken({ name: 'PROD_DATABASE_TYPE', pattern: 'prodDatabaseType' });
createToken({ name: 'USE_COMPASS', pattern: 'useCompass' });
createToken({ name: 'BUILD_TOOL', pattern: 'buildTool' });
createToken({ name: 'SEARCH_ENGINE', pattern: 'searchEngine' });
createToken({ name: 'ENABLE_TRANSLATION', pattern: 'enableTranslation' });
createToken({ name: 'APPLICATION_TYPE', pattern: 'applicationType' });
createToken({ name: 'TEST_FRAMEWORK', pattern: 'testFrameworks' });
createToken({ name: 'LANGUAGES', pattern: 'languages' });
createToken({ name: 'SERVER_PORT', pattern: 'serverPort' });
createToken({ name: 'ENABLE_SOCIAL_SIGN_IN', pattern: 'enableSocialSignIn' });
createToken({ name: 'USE_SASS', pattern: 'useSass' });
createToken({ name: 'JHI_PREFIX', pattern: 'jhiPrefix' });
createToken({ name: 'MESSAGE_BROKER', pattern: 'messageBroker' });
createToken({ name: 'SERVICE_DISCOVERY_TYPE', pattern: 'serviceDiscoveryType' });
createToken({ name: 'CLIENT_PACKAGE_MANAGER', pattern: 'clientPackageManager' });
createToken({ name: 'CLIENT_FRAMEWORK', pattern: 'clientFramework' });
createToken({ name: 'NATIVE_LANGUAGE', pattern: 'nativeLanguage' });
createToken({ name: 'FRONT_END_BUILDER', pattern: 'frontendBuilder' });
createToken({ name: 'SKIP_USER_MANAGEMENT', pattern: 'skipUserManagement' });
// skipClient & skipServer are defined below, under Options
createToken({ name: 'TRUE', pattern: 'true' });
createToken({ name: 'FALSE', pattern: 'false' });
// Entity constants
createToken({ name: 'ENTITY', pattern: 'entity' });
createToken({ name: 'RELATIONSHIP', pattern: 'relationship' });
createToken({ name: 'ENUM', pattern: 'enum' });
// Relationship types
createToken({ name: 'ONE_TO_ONE', pattern: 'OneToOne' });
createToken({ name: 'ONE_TO_MANY', pattern: 'OneToMany' });
createToken({ name: 'MANY_TO_ONE', pattern: 'ManyToOne' });
createToken({ name: 'MANY_TO_MANY', pattern: 'ManyToMany' });

// Options
createToken({ name: 'ALL', pattern: 'all' });
createToken({ name: 'STAR', pattern: '*' });
createToken({ name: 'FOR', pattern: 'for' });
createToken({ name: 'WITH', pattern: 'with' });
createToken({ name: 'EXCEPT', pattern: 'except' });
createToken({ name: 'NO_FLUENT_METHOD', pattern: 'noFluentMethod' });
createToken({ name: 'DTO', pattern: 'dto' });
createToken({ name: 'PAGINATE', pattern: 'paginate' });
createToken({ name: 'SERVICE', pattern: 'service' });
createToken({ name: 'MICROSERVICE', pattern: 'microservice' });
createToken({ name: 'SEARCH', pattern: 'search' });
createToken({ name: 'SKIP_CLIENT', pattern: 'skipClient' });
createToken({ name: 'SKIP_SERVER', pattern: 'skipServer' });
createToken({ name: 'ANGULAR_SUFFIX', pattern: 'angularSuffix' });

// Validations
createToken({ name: 'REQUIRED', pattern: 'required' });

// HIGHLIGHT:
// "MIN_MAX_KEYWORD" is an "abstract" token which other concrete tokens inherit from.
// This can be used to reduce verbosity in the parser.
createToken({ name: 'MIN_MAX_KEYWORD', pattern: Lexer.NA });
createToken({ name: 'MINLENGTH', pattern: 'minlength', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAXLENGTH', pattern: 'maxlength', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MINBYTES', pattern: 'minbytes', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAXBYTES', pattern: 'maxbytes', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAX', pattern: 'max', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MIN', pattern: 'min', parent: tokens.MIN_MAX_KEYWORD });
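
// Illustration (not part of the original diff): because the concrete tokens
// declare MIN_MAX_KEYWORD as their parent, a parser rule can consume the
// abstract token and thereby accept any of the six concrete keywords with a
// single call (the "minMaxValidation" rule below does exactly this):
//
//   $.CONSUME(t.MIN_MAX_KEYWORD); // matches 'min', 'max', 'minlength', ...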

createToken({ name: 'PATTERN', pattern: 'pattern' });

createToken({ name: 'REGEX', pattern: /\/[^\n\r/]*\// });
createToken({ name: 'INTEGER', pattern: /-?\d+/ });

// All the name tokens of the pegjs implementation have been merged into
// a single token type, because they would cause ambiguities
// when the lexing stage is separated from the parsing stage.
// The restrictions on the names should instead be implemented as semantic checks.
// That approach could also provide a better experience in an editor,
// as semantic checks do not require fault tolerance and recovery like
// syntax errors do.
// TODO: it looks like the parentheses should not be part of the name, but a suffix, e.g.: "maxlength(25)",
// TODO: because if they are part of the name then "max))((Length()1)))" is also valid...
createToken({ name: 'NAME', pattern: /[a-zA-Z_][a-zA-Z_\d()]*/ });

// Punctuation
createToken({ name: 'LPAREN', pattern: '(' });
createToken({ name: 'RPAREN', pattern: ')' });
createToken({ name: 'LCURLY', pattern: '{' });
createToken({ name: 'RCURLY', pattern: '}' });
createToken({ name: 'LSQUARE', pattern: '[' });
createToken({ name: 'RSQUARE', pattern: ']' });
createToken({ name: 'COMMA', pattern: ',' });
createToken({ name: 'COLON', pattern: ':' });
createToken({ name: 'EQUALS', pattern: '=' });
createToken({ name: 'DOT', pattern: '.' });

// TODO: the debug flag should not be enabled in production environments, for performance reasons.
// It is useful to help debug the token vector results.
const JDLLexer = new Lexer(_.values(tokens), { debug: true });
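
// Illustration (not part of the original diff): the tokenize result bundles
// everything the later stages need; the parser consumes `tokens`, while
// `groups` and `errors` remain available for tooling:
//
//   const lexResult = JDLLexer.tokenize('entity Foo { name String }');
//   // lexResult.tokens          -> the token vector fed to the parser
//   // lexResult.groups.comments -> comments, collected separately
//   // lexResult.errors          -> lexing errors; lexing continues past them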

// short prefix to reduce verbosity.
const t = tokens;

class JDLParser extends Parser {
  constructor(input) {
    super(input, tokens);

    const $ = this;

    // HIGHLIGHT: any rule may be used as a start rule; there is no artificial limit
    // like in pegjs. This capability is useful for partial parsing, e.g.:
    // 1. Code snippets.
    // 2. Incremental parsing of only the changed parts of an active editor.
    // 3. Writing unit tests for micro code samples.
    $.RULE('prog', () => {
      $.OR([
        { ALT: () => { $.SUBRULE($.constantDecl); } },
        { ALT: () => { $.SUBRULE($.entityDecl); } }
      ]);
    });
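
    // Illustration (not part of the original diff): any of the rules defined
    // here can be invoked directly on a parser instance, e.g. to parse just a
    // constant declaration snippet:
    //
    //   const lexed = JDLLexer.tokenize('MAX_LEN = 42');
    //   const parser = new JDLParser(lexed.tokens);
    //   parser.constantDecl(); // no need to start from "prog"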

    $.RULE('constantDecl', () => {
      $.CONSUME(t.NAME);
      $.CONSUME(t.EQUALS);
      $.CONSUME(t.INTEGER);
    });

    $.RULE('entityDecl', () => {
      $.CONSUME(t.ENTITY);
      $.CONSUME(t.NAME);

      $.OPTION(() => {
        $.SUBRULE($.entityTableNameDecl);
      });

      // the "2" suffix is a quirk of Chevrotain, more details:
      // https://github.com/SAP/chevrotain/blob/master/docs/faq.md#-why-are-the-unique-numerical-suffixes-consume1consume2-needed-for-the-dsl-rules
      $.OPTION2(() => {
        $.SUBRULE($.entityBody);
      });
    });

    $.RULE('entityTableNameDecl', () => {
      $.CONSUME(t.LPAREN);
      $.CONSUME(t.NAME);
      $.CONSUME(t.RPAREN);
    });

    $.RULE('entityBody', () => {
      $.CONSUME(t.LCURLY);
      $.AT_LEAST_ONE(() => {
        $.SUBRULE($.fieldDec);
      });
      $.CONSUME(t.RCURLY);
    });

    $.RULE('fieldDec', () => {
      $.CONSUME(t.NAME);
      $.SUBRULE($.type);
      // Short form for: "(X (, X)*)?"
      $.MANY_SEP({
        SEP: t.COMMA,
        DEF: () => {
          $.SUBRULE($.validation);
        }
      });
    });
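
    // Illustration (not part of the original diff): inputs accepted by
    // "fieldDec" include, for example:
    //   startDate ZonedDateTime
    //   startDate ZonedDateTime required
    //   login String required, minlength(4), maxlength(50)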

    $.RULE('type', () => {
      $.CONSUME(t.NAME);
    });

    $.RULE('validation', () => {
      $.OR([
        { ALT: () => { $.CONSUME(t.REQUIRED); } },
        { ALT: () => { $.SUBRULE($.minMaxValidation); } },
        { ALT: () => { $.SUBRULE($.pattern); } }
      ]);
    });

    $.RULE('minMaxValidation', () => {
      // HIGHLIGHT:
      // Note that "MIN_MAX_KEYWORD" is an abstract token and could match 6 different concrete token types
      $.CONSUME(t.MIN_MAX_KEYWORD);
      $.CONSUME(t.LPAREN);
      $.OR([
        { ALT: () => { $.CONSUME(t.INTEGER); } },
        { ALT: () => { $.CONSUME(t.NAME); } }
      ]);
      $.CONSUME(t.RPAREN);
    });

    $.RULE('pattern', () => {
      $.CONSUME(t.PATTERN);
      $.CONSUME(t.LPAREN);
      // HIGHLIGHT:
      // With Chevrotain the grammar can be debugged directly, using good old-fashioned breakpoints.
      // There is no need to try to figure out a 10,000-line generated file, or worse,
      // not even have that, as with some JS combinator libraries.
      // debugger;
      $.CONSUME(t.REGEX);
      $.CONSUME(t.RPAREN);
    });

    // Very important: this must be called after all the rules have been defined;
    // otherwise the parser may not work correctly, as it will lack information
    // derived during the self-analysis phase.
    Parser.performSelfAnalysis(this);

[Review comment] This all happens at the object's construction time, and all through `this`. Why not have another way?

[Answer] The syntax I prefer relies on the ESNext class fields syntax; TypeScript has something similar which already works now. Alternatively, as it is all just plain JavaScript, you can define it (mostly) however you want... I'm a bit too tired to think of a concrete alternative syntax right now. If it helps, normally you only use a single parser instance and reset its internal state before each use. Longer term, there is also an open issue for better support of custom APIs for building Chevrotain parsers.

[Follow-up] Here is a really quick and dirty factory-style hack, and here are the rules defined outside the constructor. I don't think this should be part of Chevrotain's official API. Also note that this factory mixes in the rules, so they could easily be split up. Hope this example demonstrates the flexibility that comes from Chevrotain being a library.

  }
}

module.exports = {
  tokens,
  JDLLexer,
  JDLParser
};
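
For orientation, here is a minimal end-to-end usage sketch of the exported pieces (not part of this diff; it assumes the Chevrotain 0.x API used above, where the parser constructor receives the token vector, and a require path matching the test file below):

// Hypothetical consumer code, sketch only.
const { JDLLexer, JDLParser } = require('./lib/dsl/chev_grammar');

const lexResult = JDLLexer.tokenize('entity JobHistory { startDate ZonedDateTime }');
const parser = new JDLParser(lexResult.tokens);
parser.prog();              // any rule can serve as the start rule
console.log(parser.errors); // -> [] when the input is valid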
package.json
@@ -42,6 +42,7 @@
    }
  ],
  "dependencies": {
    "chevrotain": "^0.32.1",
    "lodash": "4.17.4",
    "winston": "2.3.1"
  },
New test file for the lexer POC:
@@ -0,0 +1,65 @@
/* eslint-disable no-new, no-unused-expressions */
const expect = require('chai').expect;
const lexerModule = require('../../../lib/dsl/chev_grammar');

const JDLLexer = lexerModule.JDLLexer;

describe('Chevrotain Lexer POC', () => {
  it('Can lex a simple valid JDL text', () => {
    const input = `
   /**
    * outer comment.
    */
   entity JobHistory {
     startDate ZonedDateTime,
     /* inner comment */
     endDate ZonedDateTime,
     language Language
   }`;

    const lexResult = JDLLexer.tokenize(input);
    expect(lexResult.errors).to.be.empty;

    const comments = lexResult.groups.comments;
    expect(comments[0].image).to.include('outer comment.');
    expect(comments[0].startLine).to.equal(2);
    expect(comments[0].endLine).to.equal(4);
    expect(comments[1].image).to.include('inner comment');
    expect(comments[1].startLine).to.equal(7);
    expect(comments[1].endLine).to.equal(7);

    const tokens = lexResult.tokens;
    expect(tokens.length).to.equal(12);
    expect(tokens[0].image).to.equal('entity');
    expect(tokens[1].image).to.equal('JobHistory');
    expect(tokens[2].image).to.equal('{');
    expect(tokens[3].image).to.equal('startDate');
    expect(tokens[4].image).to.equal('ZonedDateTime');
    expect(tokens[5].image).to.equal(',');
    expect(tokens[6].image).to.equal('endDate');
    expect(tokens[7].image).to.equal('ZonedDateTime');
    expect(tokens[8].image).to.equal(',');
    expect(tokens[9].image).to.equal('language');
    expect(tokens[10].image).to.equal('Language');
    expect(tokens[11].image).to.equal('}');
  });

  it('Can lex a simple IN-valid JDL text', () => {
    const input = `
   entity JobHistory {
     startDate ZonedDateTime,
     @@@ /* invalid token but the lexing should continue */
     endDate ZonedDateTime
   }`;
    const lexResult = JDLLexer.tokenize(input);
    const errors = lexResult.errors;
    expect(errors).to.have.lengthOf(1);
    expect(errors[0].line).to.equal(4);
    expect(errors[0].column).to.equal(6);
    expect(errors[0].message).to.include('@');
    expect(errors[0].message).to.include('skipped 3 characters');

    expect(lexResult.tokens).to.have.lengthOf(9,
      'All 9 tokens should have been lexed even though "@@@" caused a syntax error');
  });
});
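
A natural next step (sketch only, not part of this diff) would be a parser-level spec in the same style; it assumes the constructor-takes-token-vector API from the grammar file:

it('Can parse a simple valid JDL text', () => {
  const JDLParser = lexerModule.JDLParser;
  const lexResult = JDLLexer.tokenize('entity JobHistory { startDate ZonedDateTime }');
  const parser = new JDLParser(lexResult.tokens);
  parser.prog();
  expect(parser.errors).to.be.empty;
});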

[Review comment] How so?

[Answer] It is used in the validation rule instead of specifying the six different keywords:
https://github.com/jhipster/jhipster-core/pull/142/files#diff-802ee05eaf770a8bbbc2fe7ef13a3efaR233

[Follow-up] Here is the corresponding section in the existing grammar: jhipster-core/lib/dsl/grammar.txt, lines 511 to 522 in fd8f712. The token inheritance does not have to be used; it is an example of what is possible and could be considered...