This repository has been archived by the owner on Mar 21, 2021. It is now read-only.

WIP - POC with Chevrotain Parser. #142

Merged
merged 9 commits on Sep 23, 2017
Changes from 2 commits
265 changes: 265 additions & 0 deletions lib/dsl/chev_grammar.js
@@ -0,0 +1,265 @@
const chevrotain = require('chevrotain');
const _ = require('lodash');


const Lexer = chevrotain.Lexer;
const Parser = chevrotain.Parser;

// ----------------- lexer -----------------
const tokens = {};

function createToken(config) {
tokens[config.name] = chevrotain.createToken(config);
}

createToken({
name: 'WhiteSpace',
// TODO: uncertain why the original grammar disallowed newlines
// TODO: in the rules "fieldDeclList" and "validationList"
pattern: /\s+/,
// HIGHLIGHT:
// This special "group" causes the lexer to completely ignore
// these tokens. In practical terms this means the parser
// does not need hundreds of "SPACE*" rules everywhere (unlike pegjs).
// With a single line we can make our language whitespace insensitive.
group: Lexer.SKIPPED,
line_breaks: true
});

// Comments
createToken({
name: 'Comment',
pattern: /\/\*[^]*?\*\//,
// HIGHLIGHT:
// By using the "group" option, the comments
// will be collected into a separate array property.
// This means comments can (unlike pegjs):
// 1. Appear anywhere.
// 2. Be completely ignored when implementing the grammar.
group: 'comments',
line_breaks: true
});
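// For example (a usage sketch; the lexer test below does exactly this):
//   const comments = JDLLexer.tokenize(input).groups.comments;
//   comments[0].image     // the full comment text
//   comments[0].startLine // position info is preserved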

// Constants
// Application constants
createToken({ name: 'APPLICATION', pattern: 'application' });
createToken({ name: 'BASE_NAME', pattern: 'baseName' });
createToken({ name: 'PATH', pattern: 'path' });
createToken({ name: 'PACKAGE_NAME', pattern: 'packageName' });
createToken({ name: 'AUTHENTICATION_TYPE', pattern: 'authenticationType' });
createToken({ name: 'HIBERNATE_CACHE', pattern: 'hibernateCache' });
createToken({ name: 'CLUSTERED_HTTP_SESSION', pattern: 'clusteredHttpSession' });
createToken({ name: 'WEBSOCKET', pattern: 'websocket' });
createToken({ name: 'DATABASE_TYPE', pattern: 'databaseType' });
createToken({ name: 'DEV_DATABASE_TYPE', pattern: 'devDatabaseType' });
createToken({ name: 'PROD_DATABASE_TYPE', pattern: 'prodDatabaseType' });
createToken({ name: 'USE_COMPASS', pattern: 'useCompass' });
createToken({ name: 'BUILD_TOOL', pattern: 'buildTool' });
createToken({ name: 'SEARCH_ENGINE', pattern: 'searchEngine' });
createToken({ name: 'ENABLE_TRANSLATION', pattern: 'enableTranslation' });
createToken({ name: 'APPLICATION_TYPE', pattern: 'applicationType' });
createToken({ name: 'TEST_FRAMEWORK', pattern: 'testFrameworks' });
createToken({ name: 'LANGUAGES', pattern: 'languages' });
createToken({ name: 'SERVER_PORT', pattern: 'serverPort' });
createToken({ name: 'ENABLE_SOCIAL_SIGN_IN', pattern: 'enableSocialSignIn' });
createToken({ name: 'USE_SASS', pattern: 'useSass' });
createToken({ name: 'JHI_PREFIX', pattern: 'jhiPrefix' });
createToken({ name: 'MESSAGE_BROKER', pattern: 'messageBroker' });
createToken({ name: 'SERVICE_DISCOVERY_TYPE', pattern: 'serviceDiscoveryType' });
createToken({ name: 'CLIENT_PACKAGE_MANAGER', pattern: 'clientPackageManager' });
createToken({ name: 'CLIENT_FRAMEWORK', pattern: 'clientFramework' });
createToken({ name: 'NATIVE_LANGUAGE', pattern: 'nativeLanguage' });
createToken({ name: 'FRONT_END_BUILDER', pattern: 'frontendBuilder' });
createToken({ name: 'SKIP_USER_MANAGEMENT', pattern: 'skipUserManagement' });
// skipClient & skipServer are defined in the options section below
createToken({ name: 'TRUE', pattern: 'true' });
createToken({ name: 'FALSE', pattern: 'false' });
// Entity constants
createToken({ name: 'ENTITY', pattern: 'entity' });
createToken({ name: 'RELATIONSHIP', pattern: 'relationship' });
createToken({ name: 'ENUM', pattern: 'enum' });
// Relationship types
createToken({ name: 'ONE_TO_ONE', pattern: 'OneToOne' });
createToken({ name: 'ONE_TO_MANY', pattern: 'OneToMany' });
createToken({ name: 'MANY_TO_ONE', pattern: 'ManyToOne' });
createToken({ name: 'MANY_TO_MANY', pattern: 'ManyToMany' });

// Options
createToken({ name: 'ALL', pattern: 'all' });
createToken({ name: 'STAR', pattern: '*' });
createToken({ name: 'FOR', pattern: 'for' });
createToken({ name: 'WITH', pattern: 'with' });
createToken({ name: 'EXCEPT', pattern: 'except' });
createToken({ name: 'NO_FLUENT_METHOD', pattern: 'noFluentMethod' });
createToken({ name: 'DTO', pattern: 'dto' });
createToken({ name: 'PAGINATE', pattern: 'paginate' });
createToken({ name: 'SERVICE', pattern: 'service' });
createToken({ name: 'MICROSERVICE', pattern: 'microservice' });
createToken({ name: 'SEARCH', pattern: 'search' });
createToken({ name: 'SKIP_CLIENT', pattern: 'skipClient' });
createToken({ name: 'SKIP_SERVER', pattern: 'skipServer' });
createToken({ name: 'ANGULAR_SUFFIX', pattern: 'angularSuffix' });

// validations
createToken({ name: 'REQUIRED', pattern: 'required' });

// HIGHLIGHT:
// "MIN_MAX_KEYWORD" is an "abstract" token which other concrete tokens inherit from.
// This can be used to reduce verbosity in the parser.
Member:

How so?

Contributor Author:

It is used in the validation rule instead of specifying the six different keywords.
https://github.com/jhipster/jhipster-core/pull/142/files#diff-802ee05eaf770a8bbbc2fe7ef13a3efaR233

Contributor Author:

Here is the corresponding section in the existing grammar:

/ MINLENGTH SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'minlength', value: int }; }
/ MINLENGTH SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'minlength', value: constantName, constant: true }; }
/ MAXLENGTH SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'maxlength', value: int }; }
/ MAXLENGTH SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'maxlength', value: constantName, constant: true }; }
/ MINBYTES SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'minbytes', value: int }; }
/ MINBYTES SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'minbytes', value: constantName, constant: true }; }
/ MAXBYTES SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'maxbytes', value: int }; }
/ MAXBYTES SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'maxbytes', value: constantName, constant: true }; }
/ MIN SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'min', value: int };}
/ MIN SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'min', value: constantName, constant: true }; }
/ MAX SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'max', value: int };}
/ MAX SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'max', value: constantName, constant: true }; }

The token inheritance does not have to be used.
It is an example of what is possible and could be considered...
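For comparison, a sketch of what the rule would look like without the inheritance, spelling out all six alternatives (the with-inheritance version appears in "minMaxValidation" below):

$.RULE('minMaxValidation', () => {
  $.OR([
    { ALT: () => { $.CONSUME(t.MINLENGTH); } },
    { ALT: () => { $.CONSUME(t.MAXLENGTH); } },
    { ALT: () => { $.CONSUME(t.MINBYTES); } },
    { ALT: () => { $.CONSUME(t.MAXBYTES); } },
    { ALT: () => { $.CONSUME(t.MIN); } },
    { ALT: () => { $.CONSUME(t.MAX); } }
  ]);
  $.CONSUME(t.LPAREN);
  $.OR2([
    { ALT: () => { $.CONSUME(t.INTEGER); } },
    { ALT: () => { $.CONSUME(t.NAME); } }
  ]);
  $.CONSUME(t.RPAREN);
});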

createToken({ name: 'MIN_MAX_KEYWORD', pattern: Lexer.NA });
createToken({ name: 'MINLENGTH', pattern: 'minlength', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAXLENGTH', pattern: 'maxlength', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MINBYTES', pattern: 'minbytes', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAXBYTES', pattern: 'maxbytes', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAX', pattern: 'max', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MIN', pattern: 'min', parent: tokens.MIN_MAX_KEYWORD });

createToken({ name: 'PATTERN', pattern: 'pattern' });

createToken({ name: 'REGEX', pattern: /\/[^\n\r/]*\// });
createToken({ name: 'INTEGER', pattern: /-?\d+/ });

// All the name tokens of the pegjs implementation have been merged into
// a single token type, because they would cause ambiguities
// when the lexing stage is separated from the parsing stage.
// The restrictions on the names should instead be implemented as semantic checks.
// That approach could also provide a better experience in an editor,
// as semantic checks don't require fault tolerance and recovery like
// syntax errors do.
// TODO: it looks like the parentheses should not be part of the name, but a suffix, e.g. "maxlength(25)",
// TODO: because if they are part of the name then this is also valid: "max))((Length()1)))"...
createToken({ name: 'NAME', pattern: /[a-zA-Z_][a-zA-Z_\d()]*/ });

// punctuation
createToken({ name: 'LPAREN', pattern: '(' });
createToken({ name: 'RPAREN', pattern: ')' });
createToken({ name: 'LCURLY', pattern: '{' });
createToken({ name: 'RCURLY', pattern: '}' });
createToken({ name: 'LSQUARE', pattern: '[' });
createToken({ name: 'RSQUARE', pattern: ']' });
createToken({ name: 'COMMA', pattern: ',' });
createToken({ name: 'COLON', pattern: ':' });
createToken({ name: 'EQUALS', pattern: '=' });
createToken({ name: 'DOT', pattern: '.' });

// TODO: the debug flag should not be enabled in production environments for performance reasons.
// It is useful to help debug the token vector results.
const JDLLexer = new Lexer(_.values(tokens), { debug: true });
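// Usage sketch (see the lexer tests below):
//   const lexResult = JDLLexer.tokenize(text);
//   lexResult.tokens holds the token vector to feed the parser,
//   lexResult.errors holds lexing errors (the lexer recovers and continues past them).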

// short prefix to reduce verbosity.
const t = tokens;

class JDLParser extends Parser {
constructor(input) {
super(input, tokens);

const $ = this;

// HIGHLIGHT: Any rule may be used as a start rule; there is no artificial limit
// like in pegjs. This capability is useful for partial parsing, e.g.:
// 1. Code snippets.
// 2. Incremental parsing of only the changed parts of an active editor.
// 3. Writing unit tests for micro code samples.
$.RULE('prog', () => {
$.OR([
{ ALT: () => { $.SUBRULE($.constantDecl); } },
{ ALT: () => { $.SUBRULE($.entityDecl); } }
]);
});
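// For example (a sketch; not exercised in this POC's tests):
//   const lexResult = JDLLexer.tokenize('PAGE_SIZE = 20');
//   const parser = new JDLParser(lexResult.tokens);
//   parser.constantDecl(); // invoke any rule directly as the start rule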

$.RULE('constantDecl', () => {
$.CONSUME(t.NAME);
$.CONSUME(t.EQUALS);
$.CONSUME(t.INTEGER);
});

$.RULE('entityDecl', () => {
$.CONSUME(t.ENTITY);
$.CONSUME(t.NAME);

$.OPTION(() => {
$.SUBRULE($.entityTableNameDecl);
});

// the "2" suffix is a quirk of Chevrotain, more details:
// https://github.com/SAP/chevrotain/blob/master/docs/faq.md#-why-are-the-unique-numerical-suffixes-consume1consume2-needed-for-the-dsl-rules
$.OPTION2(() => {
$.SUBRULE($.entityBody);
});
});

$.RULE('entityTableNameDecl', () => {
$.CONSUME(t.LPAREN);
$.CONSUME(t.NAME);
$.CONSUME(t.RPAREN);
});

$.RULE('entityBody', () => {
$.CONSUME(t.LCURLY);
// fields are comma separated, e.g. "startDate ZonedDateTime, endDate ZonedDateTime"
$.AT_LEAST_ONE_SEP({
SEP: t.COMMA,
DEF: () => {
$.SUBRULE($.fieldDec);
}
});
$.CONSUME(t.RCURLY);
});

$.RULE('fieldDec', () => {
$.CONSUME(t.NAME);
$.SUBRULE($.type);
// Short form for: "(X (, X)*)?"
// TODO: a COMMA after a validation is ambiguous with the field separator in "entityBody".
$.MANY_SEP({
SEP: t.COMMA,
DEF: () => {
$.SUBRULE($.validation);
}
});
// Note: the entity body's closing curly is consumed by "entityBody", not here.
});

$.RULE('type', () => {
$.CONSUME(t.NAME);
});

$.RULE('validation', () => {
$.OR([
{ ALT: () => { $.CONSUME(t.REQUIRED); } },
{ ALT: () => { $.SUBRULE($.minMaxValidation); } },
{ ALT: () => { $.SUBRULE($.pattern); } }
]);
});

$.RULE('minMaxValidation', () => {
// HIGHLIGHT:
// Note that "MIN_MAX_KEYWORD" is an abstract token and could match 6 different concrete token types
$.CONSUME(t.MIN_MAX_KEYWORD);
$.CONSUME(t.LPAREN);
$.OR([
{ ALT: () => { $.CONSUME(t.INTEGER); } },
{ ALT: () => { $.CONSUME(t.NAME); } }
]);
$.CONSUME(t.RPAREN);
});

$.RULE('pattern', () => {
$.CONSUME(t.PATTERN);
$.CONSUME(t.LPAREN);
// HIGHLIGHT:
// With Chevrotain the grammar can be debugged directly using good old-fashioned breakpoints.
// No need to try and figure out a 10,000-line generated file, or worse, not even have that
// if we were using some JS combinator library.
// debugger;
$.CONSUME(t.REGEX);
$.CONSUME(t.RPAREN);
});

// Very important to call this after all the rules have been defined;
// otherwise the parser may not work correctly, as it will lack information
// derived during the self-analysis phase.
Parser.performSelfAnalysis(this);
Member:

This all happens at construction time. Why not have another way?
One file is enough, but putting everything in the constructor isn't really something I look forward to maintaining, even if the improvement of using Chevrotain over PegJS is obvious. Why not, for instance, use a factory of some sort (a function that calls other functions to build the parser instance)?

Contributor Author:

Why not have another way?

Answer

The syntax I prefer relies on the ESNext class fields syntax.
https://github.com/tc39/proposal-class-fields
But this is not yet supported afaik (currently stage 3 proposal).
I suppose Babel will support this at some point:
babel/proposals#12

TypeScript has something similar which already works now.
See this example:
This is similar to the "official" API I'm aiming for, but it may need to wait for ES2018. :(

Alternative

Anyhow, as it is all just plain JavaScript, you can define it (mostly) however you want...
An extreme example would be this completely different DSL for specifying Chevrotain grammars:
https://github.com/kristianmandrup/chevrotain-rule-dsl

I am a bit too tired to think of a concrete alternative syntax right now,
but I believe one should be possible even with ES6; perhaps you have a suggestion?
The constraints are:

  1. Parser.performSelfAnalysis must be called after the rules have been defined.

    • It relies on side effects of creating the rules.
  2. The RULE calls must be made in the context of the parser instance (this).

And if it helps: normally you only use a single parser instance and reset its internal state before each use.
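A sketch of that single-instance pattern (assuming the Chevrotain version in use supports assigning to the parser's input property, which also resets its state):

const parser = new JDLParser([]);

function parse(text) {
  const lexResult = JDLLexer.tokenize(text);
  parser.input = lexResult.tokens; // assigning the input also resets the parser's state
  return parser.prog();
}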

Future / Long term.

There is also an open issue for better support of custom APIs for building Chevrotain parsers.
And I'm hoping in the long term to support three different API styles (just as Mocha/Chai offer different APIs on top of the same underlying engine).

  1. Low Level Hand-Built style.
  2. Combinator Style, fluent DSL.
  3. EBNF generator style (Like pegjs).

Contributor Author (@bd82, Aug 29, 2017):

Here is a really quick-and-dirty factory-style hack.
https://github.com/SAP/chevrotain/blob/5235a12da1818aaf2ac075cd4326d46e46da15fc/examples/grammars/json/json.js#L95-L126

And here are the rules defined outside the constructor.
https://github.com/SAP/chevrotain/blob/5235a12da1818aaf2ac075cd4326d46e46da15fc/examples/grammars/json/json.js#L129-L180

I don't think this should be part of Chevrotain's official API,
as I would rather wait for the class fields proposal, but it can be cleaned up and reused
by end users if needed...

Also note that this factory mixes in the rules, so they could easily be split up
into multiple files for large grammars.

Hope this example demonstrates how Chevrotain being a library
instead of a code generator makes it much more malleable for customization. 😄
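The shape of that factory, roughly (a sketch in the spirit of the linked code, not a verbatim copy; the names are made up):

// Rules are defined in plain functions that get mixed into the parser
// instance, so a large grammar could be split across multiple files.
function defineConstantRules($, t) {
  $.RULE('constantDecl', () => {
    $.CONSUME(t.NAME);
    $.CONSUME(t.EQUALS);
    $.CONSUME(t.INTEGER);
  });
}

class FactoryStyleParser extends Parser {
  constructor(input) {
    super(input, tokens);
    defineConstantRules(this, tokens); // the "this" context constraint still holds
    Parser.performSelfAnalysis(this);  // still called after all rules are defined
  }
}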

}
}

module.exports = {
tokens,
JDLLexer,
JDLParser
};
1 change: 1 addition & 0 deletions package.json
@@ -42,6 +42,7 @@
}
],
"dependencies": {
"chevrotain": "^0.32.1",
"lodash": "4.17.4",
"winston": "2.3.1"
},
65 changes: 65 additions & 0 deletions test/spec/grammar/lexer_test.js
@@ -0,0 +1,65 @@
/* eslint-disable no-new, no-unused-expressions */
const expect = require('chai').expect;
const lexerModule = require('../../../lib/dsl/chev_grammar');

const JDLLexer = lexerModule.JDLLexer;

describe('Chevrotain Lexer POC', () => {
it('Can lex a simple valid JDL text', () => {
const input = `
/**
* outer comment.
*/
entity JobHistory {
startDate ZonedDateTime,
/* inner comment */
endDate ZonedDateTime,
language Language
}`;

const lexResult = JDLLexer.tokenize(input);
expect(lexResult.errors).to.be.empty;

const comments = lexResult.groups.comments;
expect(comments[0].image).to.include('outer comment.');
expect(comments[0].startLine).to.equal(2);
expect(comments[0].endLine).to.equal(4);
expect(comments[1].image).to.include('inner comment');
expect(comments[1].startLine).to.equal(7);
expect(comments[1].endLine).to.equal(7);

const tokens = lexResult.tokens;
expect(tokens.length).to.equal(12);
expect(tokens[0].image).to.equal('entity');
expect(tokens[1].image).to.equal('JobHistory');
expect(tokens[2].image).to.equal('{');
expect(tokens[3].image).to.equal('startDate');
expect(tokens[4].image).to.equal('ZonedDateTime');
expect(tokens[5].image).to.equal(',');
expect(tokens[6].image).to.equal('endDate');
expect(tokens[7].image).to.equal('ZonedDateTime');
expect(tokens[8].image).to.equal(',');
expect(tokens[9].image).to.equal('language');
expect(tokens[10].image).to.equal('Language');
expect(tokens[11].image).to.equal('}');
});

it('Can lex a simple IN-valid JDL text', () => {
const input = `
   entity JobHistory {
     startDate ZonedDateTime,
     @@@ /* invalid token but the lexing should continue */
     endDate ZonedDateTime
   }`;
const lexResult = JDLLexer.tokenize(input);
const errors = lexResult.errors;
expect(errors).to.have.lengthOf(1);
expect(errors[0].line).to.equal(4);
expect(errors[0].column).to.equal(6);
expect(errors[0].message).to.include('@');
expect(errors[0].message).to.include('skipped 3 characters');

expect(lexResult.tokens).to.have.lengthOf(9,
'All 9 tokens should have been lexed even though "@@@" caused a syntax error');
});
});
4 changes: 4 additions & 0 deletions yarn.lock
@@ -187,6 +187,10 @@ check-error@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/check-error/-/check-error-1.0.2.tgz#574d312edd88bb5dd8912e9286dd6c0aed4aac82"

chevrotain@^0.32.1:
version "0.32.1"
resolved "https://registry.yarnpkg.com/chevrotain/-/chevrotain-0.32.1.tgz#8815f85c3f1c01bbfea225b689a73c1e7cea0b8b"

circular-json@^0.3.1:
version "0.3.3"
resolved "https://registry.yarnpkg.com/circular-json/-/circular-json-0.3.3.tgz#815c99ea84f6809529d2f45791bdf82711352d66"