This repository has been archived by the owner on Mar 21, 2021. It is now read-only.

WIP - POC with Chevrotain Parser. #142

Merged
merged 9 commits on Sep 23, 2017
Changes from 2 commits
265 changes: 265 additions & 0 deletions lib/dsl/chev_grammar.js
@@ -0,0 +1,265 @@
const chevrotain = require('chevrotain');
const _ = require('lodash');


const Lexer = chevrotain.Lexer;
const Parser = chevrotain.Parser;

// ----------------- lexer -----------------
const tokens = {};

function createToken(config) {
tokens[config.name] = chevrotain.createToken(config);
}

createToken({
name: 'WhiteSpace',
// TODO: uncertain why the original grammar disallowed newlines
// TODO: in the rules "fieldDeclList" and "validationList"
pattern: /\s+/,
// HIGHLIGHT:
// This special "group" causes the lexer to completely ignore
// these tokens. In practical terms this means the parser
// does not need hundreds of "SPACE*" rules everywhere (unlike pegjs).
// With a single line we can make our language whitespace insensitive.
group: Lexer.SKIPPED,
line_breaks: true
});

// Comments
createToken({
name: 'Comment',
pattern: /\/\*[^]*?\*\//,
// HIGHLIGHT:
// By using the "group" option, the comments
// will be collected into a separate array property.
// This means comments can (unlike pegjs):
// 1. Appear anywhere.
// 2. Be completely ignored when implementing the grammar.
group: 'comments',
line_breaks: true
});
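// For example (a usage sketch; the lexer test below does exactly this):
//   const comments = JDLLexer.tokenize(input).groups.comments;
//   comments[0].image     // the full comment text
//   comments[0].startLine // position info is preserved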

// Constants
// Application constants
createToken({ name: 'APPLICATION', pattern: 'application' });
createToken({ name: 'BASE_NAME', pattern: 'baseName' });
createToken({ name: 'PATH', pattern: 'path' });
createToken({ name: 'PACKAGE_NAME', pattern: 'packageName' });
createToken({ name: 'AUTHENTICATION_TYPE', pattern: 'authenticationType' });
createToken({ name: 'HIBERNATE_CACHE', pattern: 'hibernateCache' });
createToken({ name: 'CLUSTERED_HTTP_SESSION', pattern: 'clusteredHttpSession' });
createToken({ name: 'WEBSOCKET', pattern: 'websocket' });
createToken({ name: 'DATABASE_TYPE', pattern: 'databaseType' });
createToken({ name: 'DEV_DATABASE_TYPE', pattern: 'devDatabaseType' });
createToken({ name: 'PROD_DATABASE_TYPE', pattern: 'prodDatabaseType' });
createToken({ name: 'USE_COMPASS', pattern: 'useCompass' });
createToken({ name: 'BUILD_TOOL', pattern: 'buildTool' });
createToken({ name: 'SEARCH_ENGINE', pattern: 'searchEngine' });
createToken({ name: 'ENABLE_TRANSLATION', pattern: 'enableTranslation' });
createToken({ name: 'APPLICATION_TYPE', pattern: 'applicationType' });
createToken({ name: 'TEST_FRAMEWORK', pattern: 'testFrameworks' });
createToken({ name: 'LANGUAGES', pattern: 'languages' });
createToken({ name: 'SERVER_PORT', pattern: 'serverPort' });
createToken({ name: 'ENABLE_SOCIAL_SIGN_IN', pattern: 'enableSocialSignIn' });
createToken({ name: 'USE_SASS', pattern: 'useSass' });
createToken({ name: 'JHI_PREFIX', pattern: 'jhiPrefix' });
createToken({ name: 'MESSAGE_BROKER', pattern: 'messageBroker' });
createToken({ name: 'SERVICE_DISCOVERY_TYPE', pattern: 'serviceDiscoveryType' });
createToken({ name: 'CLIENT_PACKAGE_MANAGER', pattern: 'clientPackageManager' });
createToken({ name: 'CLIENT_FRAMEWORK', pattern: 'clientFramework' });
createToken({ name: 'NATIVE_LANGUAGE', pattern: 'nativeLanguage' });
createToken({ name: 'FRONT_END_BUILDER', pattern: 'frontendBuilder' });
createToken({ name: 'SKIP_USER_MANAGEMENT', pattern: 'skipUserManagement' });
// skipClient & skipServer are defined in the options section below
createToken({ name: 'TRUE', pattern: 'true' });
createToken({ name: 'FALSE', pattern: 'false' });
// Entity constants
createToken({ name: 'ENTITY', pattern: 'entity' });
createToken({ name: 'RELATIONSHIP', pattern: 'relationship' });
createToken({ name: 'ENUM', pattern: 'enum' });
// Relationship types
createToken({ name: 'ONE_TO_ONE', pattern: 'OneToOne' });
createToken({ name: 'ONE_TO_MANY', pattern: 'OneToMany' });
createToken({ name: 'MANY_TO_ONE', pattern: 'ManyToOne' });
createToken({ name: 'MANY_TO_MANY', pattern: 'ManyToMany' });

// Options
createToken({ name: 'ALL', pattern: 'all' });
createToken({ name: 'STAR', pattern: '*' });
createToken({ name: 'FOR', pattern: 'for' });
createToken({ name: 'WITH', pattern: 'with' });
createToken({ name: 'EXCEPT', pattern: 'except' });
createToken({ name: 'NO_FLUENT_METHOD', pattern: 'noFluentMethod' });
createToken({ name: 'DTO', pattern: 'dto' });
createToken({ name: 'PAGINATE', pattern: 'paginate' });
createToken({ name: 'SERVICE', pattern: 'service' });
createToken({ name: 'MICROSERVICE', pattern: 'microservice' });
createToken({ name: 'SEARCH', pattern: 'search' });
createToken({ name: 'SKIP_CLIENT', pattern: 'skipClient' });
createToken({ name: 'SKIP_SERVER', pattern: 'skipServer' });
createToken({ name: 'ANGULAR_SUFFIX', pattern: 'angularSuffix' });

// validations
createToken({ name: 'REQUIRED', pattern: 'required' });

// HIGHLIGHT:
// "MIN_MAX_KEYWORD" is an "abstract" token which other concrete tokens inherit from.
// This can be used to reduce verbosity in the parser.
Member:

How so?

Contributor Author:

It is used in the validation rule instead of specifying the six different keywords.
https://github.com/jhipster/jhipster-core/pull/142/files#diff-802ee05eaf770a8bbbc2fe7ef13a3efaR233

Contributor Author:

Here is the corresponding section in the existing grammar:

/ MINLENGTH SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'minlength', value: int }; }
/ MINLENGTH SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'minlength', value: constantName, constant: true }; }
/ MAXLENGTH SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'maxlength', value: int }; }
/ MAXLENGTH SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'maxlength', value: constantName, constant: true }; }
/ MINBYTES SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'minbytes', value: int }; }
/ MINBYTES SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'minbytes', value: constantName, constant: true }; }
/ MAXBYTES SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'maxbytes', value: int }; }
/ MAXBYTES SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'maxbytes', value: constantName, constant: true }; }
/ MIN SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'min', value: int };}
/ MIN SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'min', value: constantName, constant: true }; }
/ MAX SPACE* '(' SPACE* int:INTEGER SPACE* ')' { return { key: 'max', value: int };}
/ MAX SPACE* '(' SPACE* constantName:CONSTANT_NAME SPACE* ')' { return { key: 'max', value: constantName, constant: true }; }

The token inheritance does not have to be used.
It is an example of what is possible and could be considered...
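For comparison, a sketch of what the rule would look like without the inheritance, spelling out all six alternatives (the with-inheritance version appears in "minMaxValidation" below):

$.RULE('minMaxValidation', () => {
  $.OR([
    { ALT: () => { $.CONSUME(t.MINLENGTH); } },
    { ALT: () => { $.CONSUME(t.MAXLENGTH); } },
    { ALT: () => { $.CONSUME(t.MINBYTES); } },
    { ALT: () => { $.CONSUME(t.MAXBYTES); } },
    { ALT: () => { $.CONSUME(t.MIN); } },
    { ALT: () => { $.CONSUME(t.MAX); } }
  ]);
  $.CONSUME(t.LPAREN);
  $.OR2([
    { ALT: () => { $.CONSUME(t.INTEGER); } },
    { ALT: () => { $.CONSUME(t.NAME); } }
  ]);
  $.CONSUME(t.RPAREN);
});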

createToken({ name: 'MIN_MAX_KEYWORD', pattern: Lexer.NA });
createToken({ name: 'MINLENGTH', pattern: 'minlength', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAXLENGTH', pattern: 'maxlength', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MINBYTES', pattern: 'minbytes', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAXBYTES', pattern: 'maxbytes', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MAX', pattern: 'max', parent: tokens.MIN_MAX_KEYWORD });
createToken({ name: 'MIN', pattern: 'min', parent: tokens.MIN_MAX_KEYWORD });

createToken({ name: 'PATTERN', pattern: 'pattern' });

createToken({ name: 'REGEX', pattern: /\/[^\n\r/]*\// });
createToken({ name: 'INTEGER', pattern: /-?\d+/ });

// All the name tokens of the pegjs implementation have been merged into
// a single token type, because they would cause ambiguities
// when the lexing stage is separated from the parsing stage.
// The restrictions on the names should instead be implemented as semantic checks.
// That approach could also provide a better experience in an editor,
// as semantic checks don't require fault tolerance and recovery like
// syntax errors do.
// TODO: it looks like the parentheses should not be part of the name, but a suffix, e.g. "maxlength(25)",
// TODO: because if they are part of the name then this is also valid: "max))((Length()1)))"...
createToken({ name: 'NAME', pattern: /[a-zA-Z_][a-zA-Z_\d()]*/ });

// punctuation
createToken({ name: 'LPAREN', pattern: '(' });
createToken({ name: 'RPAREN', pattern: ')' });
createToken({ name: 'LCURLY', pattern: '{' });
createToken({ name: 'RCURLY', pattern: '}' });
createToken({ name: 'LSQUARE', pattern: '[' });
createToken({ name: 'RSQUARE', pattern: ']' });
createToken({ name: 'COMMA', pattern: ',' });
createToken({ name: 'COLON', pattern: ':' });
createToken({ name: 'EQUALS', pattern: '=' });
createToken({ name: 'DOT', pattern: '.' });

// TODO: the debug flag should not be enabled in production environments for performance reasons.
// It is useful to help debug the token vector results.
const JDLLexer = new Lexer(_.values(tokens), { debug: true });
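// Usage sketch (see the lexer tests below):
//   const lexResult = JDLLexer.tokenize(text);
//   lexResult.tokens holds the token vector to feed the parser,
//   lexResult.errors holds lexing errors (the lexer recovers and continues past them).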

// short prefix to reduce verbosity.
const t = tokens;

class JDLParser extends Parser {
constructor(input) {
super(input, tokens);

const $ = this;

// HIGHLIGHT: Any rule may be used as a start rule; there is no artificial limit
// like in pegjs. This capability is useful for partial parsing, e.g.:
// 1. Code snippets.
// 2. Incremental parsing of only the changed parts of an active editor.
// 3. Writing unit tests for micro code samples.
$.RULE('prog', () => {
$.OR([
{ ALT: () => { $.SUBRULE($.constantDecl); } },
{ ALT: () => { $.SUBRULE($.entityDecl); } }
]);
});
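// For example (a sketch; not exercised in this POC's tests):
//   const lexResult = JDLLexer.tokenize('PAGE_SIZE = 20');
//   const parser = new JDLParser(lexResult.tokens);
//   parser.constantDecl(); // invoke any rule directly as the start rule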

$.RULE('constantDecl', () => {
$.CONSUME(t.NAME);
$.CONSUME(t.EQUALS);
$.CONSUME(t.INTEGER);
});

$.RULE('entityDecl', () => {
$.CONSUME(t.ENTITY);
$.CONSUME(t.NAME);

$.OPTION(() => {
$.SUBRULE($.entityTableNameDecl);
});

// the "2" suffix is a quirk of Chevrotain, more details:
// https://github.com/SAP/chevrotain/blob/master/docs/faq.md#-why-are-the-unique-numerical-suffixes-consume1consume2-needed-for-the-dsl-rules
$.OPTION2(() => {
$.SUBRULE($.entityBody);
});
});

$.RULE('entityTableNameDecl', () => {
$.CONSUME(t.LPAREN);
$.CONSUME(t.NAME);
$.CONSUME(t.RPAREN);
});

$.RULE('entityBody', () => {
$.CONSUME(t.LCURLY);
// fields are comma separated, e.g. "startDate ZonedDateTime, endDate ZonedDateTime"
$.AT_LEAST_ONE_SEP({
SEP: t.COMMA,
DEF: () => {
$.SUBRULE($.fieldDec);
}
});
$.CONSUME(t.RCURLY);
});

$.RULE('fieldDec', () => {
$.CONSUME(t.NAME);
$.SUBRULE($.type);
// Short form for: "(X (, X)*)?"
// TODO: a COMMA after a validation is ambiguous with the field separator in "entityBody".
$.MANY_SEP({
SEP: t.COMMA,
DEF: () => {
$.SUBRULE($.validation);
}
});
// Note: the entity body's closing curly is consumed by "entityBody", not here.
});

$.RULE('type', () => {
$.CONSUME(t.NAME);
});

$.RULE('validation', () => {
$.OR([
{ ALT: () => { $.CONSUME(t.REQUIRED); } },
{ ALT: () => { $.SUBRULE($.minMaxValidation); } },
{ ALT: () => { $.SUBRULE($.pattern); } }
]);
});

$.RULE('minMaxValidation', () => {
// HIGHLIGHT:
// Note that "MIN_MAX_KEYWORD" is an abstract token and could match 6 different concrete token types
$.CONSUME(t.MIN_MAX_KEYWORD);
$.CONSUME(t.LPAREN);
$.OR([
{ ALT: () => { $.CONSUME(t.INTEGER); } },
{ ALT: () => { $.CONSUME(t.NAME); } }
]);
$.CONSUME(t.RPAREN);
});

$.RULE('pattern', () => {
$.CONSUME(t.PATTERN);
$.CONSUME(t.LPAREN);
// HIGHLIGHT:
// With Chevrotain the grammar can be debugged directly using good old-fashioned breakpoints.
// No need to try and figure out a 10,000-line generated file, or worse, not even have that
// if we were using some JS combinator library.
// debugger;
$.CONSUME(t.REGEX);
$.CONSUME(t.RPAREN);
});

// Very important to call this after all the rules have been defined;
// otherwise the parser may not work correctly, as it will lack information
// derived during the self-analysis phase.
Parser.performSelfAnalysis(this);
Member:

This all happens at construction time. Why not have another way?
One file is enough, but putting everything in the constructor isn't really something I look forward to maintaining, even if the improvement of using Chevrotain over PegJS is obvious. Why not, for instance, use a factory of some sort (a function that calls other functions to build the parser instance)?

Contributor Author:

Why not have another way?

Answer

The syntax I prefer relies on the ESNext class fields syntax.
https://github.com/tc39/proposal-class-fields
But this is not yet supported afaik (currently stage 3 proposal).
I suppose Babel will support this at some point:
babel/proposals#12

TypeScript has something similar which already works now.
See this example:
This is similar to the "official" API I'm aiming for, but it may need to wait for ES2018. :(

Alternative

Anyhow, as it is all just plain JavaScript, you can define it (mostly) however you want...
An extreme example would be this completely different DSL for specifying Chevrotain grammars:
https://github.com/kristianmandrup/chevrotain-rule-dsl

I am a bit too tired to think of a concrete alternative syntax right now,
but I believe one should be possible even with ES6; perhaps you have a suggestion?
The constraints are:

  1. Parser.performSelfAnalysis must be called after the rules have been defined.

    • It relies on side effects of creating the rules.
  2. The RULE calls must be made in the context of the parser instance (this).

And if it helps: normally you only use a single parser instance and reset its internal state before each use.
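A sketch of that single-instance pattern (assuming the Chevrotain version in use supports assigning to the parser's input property, which also resets its state):

const parser = new JDLParser([]);

function parse(text) {
  const lexResult = JDLLexer.tokenize(text);
  parser.input = lexResult.tokens; // assigning the input also resets the parser's state
  return parser.prog();
}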

Future / Long term.

There is also an open issue for better support of custom APIs for building Chevrotain parsers.
And I'm hoping in the long term to support three different API styles (just as Mocha/Chai offer different APIs on top of the same underlying engine).

  1. Low Level Hand-Built style.
  2. Combinator Style, fluent DSL.
  3. EBNF generator style (Like pegjs).

Contributor Author (@bd82, Aug 29, 2017):

Here is a really quick-and-dirty factory-style hack.
https://github.com/SAP/chevrotain/blob/5235a12da1818aaf2ac075cd4326d46e46da15fc/examples/grammars/json/json.js#L95-L126

And here are the rules defined outside the constructor.
https://github.com/SAP/chevrotain/blob/5235a12da1818aaf2ac075cd4326d46e46da15fc/examples/grammars/json/json.js#L129-L180

I don't think this should be part of Chevrotain's official API,
as I would rather wait for the class fields proposal, but it can be cleaned up and reused
by end users if needed...

Also note that this factory mixes in the rules, so they could easily be split up
into multiple files for large grammars.

Hope this example demonstrates how Chevrotain being a library
instead of a code generator makes it much more malleable for customization. 😄
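The shape of that factory, roughly (a sketch in the spirit of the linked code, not a verbatim copy; the names are made up):

// Rules are defined in plain functions that get mixed into the parser
// instance, so a large grammar could be split across multiple files.
function defineConstantRules($, t) {
  $.RULE('constantDecl', () => {
    $.CONSUME(t.NAME);
    $.CONSUME(t.EQUALS);
    $.CONSUME(t.INTEGER);
  });
}

class FactoryStyleParser extends Parser {
  constructor(input) {
    super(input, tokens);
    defineConstantRules(this, tokens); // the "this" context constraint still holds
    Parser.performSelfAnalysis(this);  // still called after all rules are defined
  }
}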

}
}

module.exports = {
tokens,
JDLLexer,
JDLParser
};
1 change: 1 addition & 0 deletions package.json
@@ -42,6 +42,7 @@
}
],
"dependencies": {
"chevrotain": "^0.32.1",
"lodash": "4.17.4",
"winston": "2.3.1"
},
65 changes: 65 additions & 0 deletions test/spec/grammar/lexer_test.js
@@ -0,0 +1,65 @@
/* eslint-disable no-new, no-unused-expressions */
const expect = require('chai').expect;
const lexerModule = require('../../../lib/dsl/chev_grammar');

const JDLLexer = lexerModule.JDLLexer;

describe('Chevrotain Lexer POC', () => {
it('Can lex a simple valid JDL text', () => {
const input = `
/**
* outer comment.
*/
entity JobHistory {
startDate ZonedDateTime,
/* inner comment */
endDate ZonedDateTime,
language Language
}`;

const lexResult = JDLLexer.tokenize(input);
expect(lexResult.errors).to.be.empty;

const comments = lexResult.groups.comments;
expect(comments[0].image).to.include('outer comment.');
expect(comments[0].startLine).to.equal(2);
expect(comments[0].endLine).to.equal(4);
expect(comments[1].image).to.include('inner comment');
expect(comments[1].startLine).to.equal(7);
expect(comments[1].endLine).to.equal(7);

const tokens = lexResult.tokens;
expect(tokens.length).to.equal(12);
expect(tokens[0].image).to.equal('entity');
expect(tokens[1].image).to.equal('JobHistory');
expect(tokens[2].image).to.equal('{');
expect(tokens[3].image).to.equal('startDate');
expect(tokens[4].image).to.equal('ZonedDateTime');
expect(tokens[5].image).to.equal(',');
expect(tokens[6].image).to.equal('endDate');
expect(tokens[7].image).to.equal('ZonedDateTime');
expect(tokens[8].image).to.equal(',');
expect(tokens[9].image).to.equal('language');
expect(tokens[10].image).to.equal('Language');
expect(tokens[11].image).to.equal('}');
});

it('Can lex a simple IN-valid JDL text', () => {
const input = `
   entity JobHistory {
     startDate ZonedDateTime,
     @@@ /* invalid token but the lexing should continue */
     endDate ZonedDateTime
   }`;
const lexResult = JDLLexer.tokenize(input);
const errors = lexResult.errors;
expect(errors).to.have.lengthOf(1);
expect(errors[0].line).to.equal(4);
expect(errors[0].column).to.equal(6);
expect(errors[0].message).to.include('@');
expect(errors[0].message).to.include('skipped 3 characters');

expect(lexResult.tokens).to.have.lengthOf(9,
'All 9 tokens should have been lexed even though "@@@" caused a syntax error');
});
});
4 changes: 4 additions & 0 deletions yarn.lock
@@ -187,6 +187,10 @@ check-error@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/check-error/-/check-error-1.0.2.tgz#574d312edd88bb5dd8912e9286dd6c0aed4aac82"

chevrotain@^0.32.1:
version "0.32.1"
resolved "https://registry.yarnpkg.com/chevrotain/-/chevrotain-0.32.1.tgz#8815f85c3f1c01bbfea225b689a73c1e7cea0b8b"

circular-json@^0.3.1:
version "0.3.3"
resolved "https://registry.yarnpkg.com/circular-json/-/circular-json-0.3.3.tgz#815c99ea84f6809529d2f45791bdf82711352d66"