diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..92ad407 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.bat text eol=crlf diff --git a/.github/workflows/gradle.yml b/.github/workflows/gradle.yml new file mode 100644 index 0000000..f9dc544 --- /dev/null +++ b/.github/workflows/gradle.yml @@ -0,0 +1,25 @@ +name: Java CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 17 + uses: actions/setup-java@v2 + with: + java-version: "17" + distribution: "temurin" + - name: Validate Gradle wrapper + uses: gradle/wrapper-validation-action@e6e38bacfdf1a337459f332974bb2327a31aaf4b + - name: Build + run: ./gradlew build + - name: Test + run: ./gradlew test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b357907 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.gradle/ +.idea/ +build/ +bin/ +html/ diff --git a/COPYRIGHT.md b/COPYRIGHT.md new file mode 100644 index 0000000..7311994 --- /dev/null +++ b/COPYRIGHT.md @@ -0,0 +1,23 @@ +Copyright (c) 2022 Maroontress Fast Software. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS *AS IS* AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..c72ef8a --- /dev/null +++ b/README.md @@ -0,0 +1,557 @@ +# Clione + +Clione is a Java implementation of a lexical parser that tokenizes source code +written in C17 and other C-like programming languages. + +The main facility is a tokenization API corresponding to the C preprocessor +layer. It includes the features of trigraph replacement, line splicing, and +tokenization but does not include macro expansion and directive handling. 
+ +## Example + +[A typical usage example](src/test/java/com/example/TokenDemo.java) would be as +follows: + +```java +package com.example; + +import java.io.IOException; +import java.nio.file.FileSystems; +import java.nio.file.Files; + +import com.maroontress.clione.LexicalParser; +import com.maroontress.clione.Token; + +public final class TokenDemo { + + public static void main(String[] args) { + var path = FileSystems.getDefault().getPath(args[0]); + try (var parser = LexicalParser.of(Files.newBufferedReader(path))) { + run(parser); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void run(LexicalParser parser) throws IOException { + for (;;) { + var maybeToken = parser.next(); + if (maybeToken.isEmpty()) { + break; + } + var token = maybeToken.get(); + printToken(token, ""); + } + } + + public static void printToken(Token token, String indent) { + var type = token.getType(); + var value = token.getValue(); + var span = token.getSpan(); + var s = switch (type) { + case DELIMITER, DIRECTIVE_END + -> "'" + value.replaceAll("\n", "\\\\n") + "'"; + default -> value; + }; + System.out.printf("%s%s: %s: %s%n", indent, span, type, s); + for (var child : token.getChildren()) { + printToken(child, indent + "| "); + } + } +} +``` + +And [`helloworld.c`](src/test/resources/com/example/helloworld.c) would be as +follows: + +```c +#include <stdio.h> + +int main(void) +{ + printf("hello world\n"); +} +``` + +In this example, the result of "`java com.example.TokenDemo helloworld.c`" is +as follows: + +```plaintext +L1:1--19: DIRECTIVE: # +| L1:2--8: DIRECTIVE_NAME: include +| L1:9: DELIMITER: ' ' +| L1:10--18: STANDARD_HEADER: <stdio.h> +| L1:19: DIRECTIVE_END: '\n' +L2:1: DELIMITER: '\n' +L3:1--3: RESERVED: int +L3:4: DELIMITER: ' ' +L3:5--8: IDENTIFIER: main +L3:9: PUNCTUATOR: ( +L3:10--13: RESERVED: void +L3:14: PUNCTUATOR: ) +L3:15: DELIMITER: '\n' +L4:1: PUNCTUATOR: { +L4:2--L5:4: DELIMITER: '\n ' +L5:5--10: IDENTIFIER: printf +L5:11: PUNCTUATOR: ( +L5:12--26: 
STRING: "hello world\n" +L5:27: PUNCTUATOR: ) +L5:28: PUNCTUATOR: ; +L5:29: DELIMITER: '\n' +L6:1: PUNCTUATOR: } +L6:2: DELIMITER: '\n' +``` + +## Tokens + +The `LexicalParser` object creates and returns a token from the stream of the +source file. It often extracts the ones from the source file, but trigraph and +digraph substitution and line concatenation may result in tokens that are not +in the source file. It returns an empty token when it finally reaches the end +of the source file. + +The `Token` objects that the `next()` method of `LexicalParser` instance +returns are the preprocessing tokens. So, the evaluation is necessary before +using their content. In other words, they can be incomplete according to the +token type. For example, the string literal or comment may not terminate, the +preprocessing number may not represent valid integer and floating-point +constants, and so on. + +As in the example above, `Token` objects can have children, which means they +can be in a tree structure. For tokens that the `next()` method returns, tokens +of type `TokenType.DIRECTIVE` only have children. + +The `Token` object has its type, span, and characters. The type is one of the +constants defined in `enum TokenType`, the span represents the range of the +source file where the token occurs, and the characters are `SourceChar` objects +that compose it. + +## Characters + +The `SourceChar` object represents a character that composes the token or EOF. +It may also have one or more child characters in some cases. 
For example, it is +the case that it represents: + +- the character which is substituted for any digraph or trigraph sequence +- the character that follows a backslash (`\`) at the end of the line + +[The following code](src/test/java/com/example/SourceCharDemo.java) shows an +example: + +```java +package com.example; + +import java.io.IOException; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.util.List; + +import com.maroontress.clione.LexicalParser; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.Token; + +public final class SourceCharDemo { + + public static void main(String[] args) { + var path = FileSystems.getDefault().getPath(args[0]); + try (var parser = LexicalParser.of(Files.newBufferedReader(path))) { + run(parser); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void run(LexicalParser parser) throws IOException { + for (;;) { + var maybeToken = parser.next(); + if (maybeToken.isEmpty()) { + break; + } + printToken(maybeToken.get()); + } + } + + public static void printToken(Token token) { + var type = token.getType(); + var value = token.getValue(); + var span = token.getSpan(); + var s = switch (type) { + case DELIMITER, DIRECTIVE_END + -> "'" + value.replaceAll("\n", "\\\\n") + "'"; + default -> value; + }; + System.out.printf("%s: %s: %s%n", span, type, s); + printChars(token.getChars(), " "); + } + + private static void printChars(List<SourceChar> chars, String indent) { + for (var c : chars) { + var span = c.getSpan(); + var value = c.toChar(); + var s = (value == '\n') + ? "'\\n'" + : Character.isHighSurrogate(value) + ? "H(0x" + Integer.toString((int) value, 16) + ")" + : Character.isLowSurrogate(value) + ? 
"L(0x" + Integer.toString((int) value, 16) + ")" + : String.valueOf(value); + System.out.printf("%s%s: %s%n", indent, span, s); + printChars(c.getChildren(), indent + "| "); + } + } +} +``` + +And +[`main.c`](src/test/resources/com/example/main.c) would be as follows: + +```c +ma??/ +in +``` + +In this example, the result of "`java com.example.SourceCharDemo main.c`" is as follows: + +```plaintext +L1:1--L2:2: IDENTIFIER: main + L1:1: m + L1:2: a + L1:3--L2:1: i + | L1:3--5: \ + | | L1:3: ? + | | L1:4: ? + | | L1:5: / + | L1:6: '\n' + | L2:1: i + L2:2: n +โ‹ฎ +``` + +The result illustrates that the character `i` in the identifier `main` has +child characters: a backslash (`\`), a newline (`\n`), and `i`. Furthermore, +the backslash character has child characters: `?`, `?`, and `/`. Of course, +what happens is that the trigraph sequence `??/` is replaced with a backslash +at first, and then the backslash at the end of the line results in the line +concatenation. + +## Surrogate pairs + +A character corresponds to a column. So, one `char` value often represents one +column. However, in the case of a character represented with a surrogate pair, +the two `char` values in the pair represent one column. Here is an example +[`emojicat.c`](src/test/resources/com/example/emojicat.c): + +```c +char *cat = u8"๐Ÿฑ"; +``` + +The result of "`java com.example.SourceCharDemo emojicat.c`" is as follows: + +```plaintext +โ‹ฎ +L1:19--23: STRING: u8"๐Ÿฑ" + L1:13: u + L1:14: 8 + L1:15: " + L1:16: H(0xd83d) + L1:16: L(0xdc31) + L1:17: " +โ‹ฎ +``` + +This example shows that the high and low surrogate characters are in the same +column. + +## Phases of translation + +The lexical parser starts tokenization after trigraph replacement and line +splicing, according to the +[_phases of translation_][wikipedia-phases-of-translation]. 
+ +### Newlines + +Before anything else, the lexical parser substitutes `\n` for all newlines, +that is, line feed (LF), carriage return and line feed (CRLF), and carriage +return (CR) in the stream, even if different newlines are mixed in the stream. +It indicates `\n` as a newline (NL) character, regardless of platform. + +### Trigraphs + +After unifying newline characters, the lexical parser replaces +[trigraph sequences][wikipedia-trigraph] with the new `SourceChar` objects they +represent. The new one becomes the parent of the replaced characters and +represents their equivalent. The following table lists all trigraphs: + +| Trigraph | Equivalent | +| :---: | :---: | +| `??<` | `{` | +| `??>` | `}` | +| `??(` | `[` | +| `??)` | `]` | +| `??=` | `#` | +| `??/` | `\` | +| `??'` | `^` | +| `??!` | `\|` | +| `??-` | `~` | + +### Line splicing + +After the trigraph replacement, the lexical parser removes the backslash +character at the end of the line. To be more precise, it replaces the +backslash, the newline character, and the next character with a new +`SourceChar` object. The new one becomes the parent of the replaced characters +and represents the character that followed the backslash and newline +characters. + +A pair of the backslash and newline characters may appear two or more times +with consecutive occurrences. In that case, the new substituted one becomes the +parent of both their characters and the next character. + +### Tokenization + +After line splicing, the lexical parser starts to break the `SourceChar` stream +into `Token`s. A `Token` object may be either: + +- delimiters (that are sequences of whitespace characters) +- comments +- directives +- preprocessing tokens (that are standard header names, identifiers, + preprocessing numbers, character constants, string literals, operators and + punctuators, or unknown tokens) + +## Delimiters + +A delimiter is a separator between tokens. 
Strictly speaking, it is not a +token, but the lexical parser returns the delimiter as a token. Some +applications may completely ignore delimiters (for example, code formatters). + +The space, horizontal tab (HT), form feed (FF), vertical tab (VT), and NL +characters are delimiters within any non-directive line. The space and HT +characters are delimiters within any directive line. + +> ☕ By the way, have you seen source code including FF and VT characters? In +> the past, people often printed source code on paper. In the 1980s, I saw some +> source code that included an FF character inserted between functions. It +> resulted in a page break, so each function started at the top of the page. As +> far as a VT character goes, I have never seen it in the source code. + +The token type of delimiters is `TokenType.DELIMITER`. + +## Comments + +A comment also can be a delimiter, because C preprocessors replace each comment +with a space character. + +There are two types of comments. One starts with `/*` and ends with `*/`. +The other starts with `//` and ends with a newline character. No comment can be +inside a character constant, a string literal, a standard header name, or a +filename in either case. + +The content of the token can be incomplete. For example, it may not terminate, +and so on. + +The token type of comments is `TokenType.COMMENT`. + +## Identifiers + +An identifier is a preprocessing token. + +The first character of an identifier name must be one of: + +- an underscore character or an uppercase or lowercase letter (`[_A-Za-z]`) +- universal character names (`\uXXXX` or `\UXXXXXXXX`, `X` is a hexadecimal + digit) +- other implementation-defined characters + +The second and subsequent characters must be one of them or a digit (`[0-9]`). 
+ +The _other implementation-defined characters_ that `LexicalParser`'s +implementation defines are of +[Unicode Identifier](https://unicode.org/reports/tr31/) that is as follows: + +- The first character: a character with which the + [Character.isUnicodeIdentifierStart(int)][isUnicodeIdentifierStart] + method returns `true` +- The second and subsequent character: a character with which the + [Character.isUnicodeIdentifierPart(int)][isUnicodeIdentifierPart] + method returns `true` + +So, the lexical parser can parse the following C code: + +```c +char *\U0001f431 = "cat"; +``` + +However, it does NOT support the following code because Unicode Identifier does +not contain the emoji characters such as 🐱: + +```c +char *🐱 = "cat"; +``` + +Note that the recent famous C compilers (like GCC, Clang, etc.) can compile the +code where an identifier contains emoji characters like this. + +The token type of identifiers is `TokenType.IDENTIFIER`. + +## Reserved words + +Reserved words are equivalent to identifiers, but they are in the set of +keywords, which you can specify with the factory method of `LexicalParser`. + +The token type of reserved words is `TokenType.RESERVED`. + +## Character constants + +A character constant is a preprocessing token. + +It consists of one or more characters enclosed in single quotes. The quotes may +follow a prefix either `L`, `u`, or `U`. It may contain +[escape sequences][wikipedia-escape-character]. It may not contain a newline +character. + +The content of the token can be incomplete. For example, it may not terminate, +it may contain no character, two or more characters, or invalid escape +sequences inside the single quotes, and so on. + +The token type of character constants is `TokenType.CHARACTER`. + +## String literals + +A string literal is a preprocessing token. + +It consists of zero or more characters enclosed in double quotes. The quotes +may follow a prefix either `L`, `u`, `U`, or `u8`. 
It may contain +[escape sequences][wikipedia-escape-character]. It may not contain a newline +character. + +The content of the token can be incomplete. For example, it may not terminate, +it may contain invalid escape sequences inside the double quotes, and so on. + +The token type of string literals is `TokenType.STRING`. + +## Preprocessing numbers + +A preprocessing number is a preprocessing token. + +It includes all integer and floating-point constants, but it also covers +other forms that are neither. + +The content of the token can be incomplete. For example, it may represent +neither a valid integer constant nor a valid floating-point constant, and so on. + +The token type of preprocessing numbers is `TokenType.NUMBER`. + +## Operators and punctuators + +Operator or punctuator tokens are preprocessing tokens. The following table +lists valid tokens of which the type is `TokenType.OPERATOR`: + +```plaintext ++ - * / % ++ -- == != +> < >= <= ! && || ~ & +| ^ << >> = += -= *= /= +%= &= |= ^= <<= >>= -> . ? +``` + +Note that these are preprocessing tokens, not C operators. For example, +`sizeof` is an operator in C, but a reserved word (or an identifier) as a +preprocessing token. + +The following table lists all valid tokens of which the type is +`TokenType.PUNCTUATOR`: + +```plaintext +( ) [ ] { } : +; , ... <: :> <% %> +``` + +The lexical parser specially treats the four tokens: `#`, `%:`, `##`, and +`%:%:`. The type of them is `TokenType.OPERATOR` in directive lines. 
Otherwise, +`#` and `%:` are of type `TokenType.DIRECTIVE`, and `##` and `%:%:` are of type +`TokenType.UNKNOWN` as follows: + +| Tokens | In directive lines | Otherwise | +|:---:|:---:|:---:| +| `#` `%:` | `TokenType.OPERATOR` | `TokenType.DIRECTIVE` | +| `##` `%:%:` | `TokenType.OPERATOR` | `TokenType.UNKNOWN` | + +The following table lists all tokens that are digraphs: + +| Token | Equivalent | +| :---: | :---: | +| `<:` | `[` | +| `:>` | `]` | +| `<%` | `{` | +| `%>` | `}` | +| `%:` | `#` | +| `%:%:` | `##` | + +The lexical parser replaces the digraphs with their equivalents. The +substituted characters have the child characters that represent the replaced +ones. + +## Directives + +A directive token consists of a number sign (or hash) character (`#`) and the +child tokens. The null directive has no child tokens. + +The child tokens must include a directive name, arguments (depending on the +directive name), and the end of the directive (that is a newline character). +They also may include delimiters and comments. The last of them must be the end +of the directive. + +The content of the child tokens can be incomplete. For example, they may +represent an invalid directive, they may not end with the end of the directive, +and so on. + +The token type of directives is `TokenType.DIRECTIVE`. + +The tokens that represent the directive names must have the content which is +either: `define`, `undef`, `include`, `if`, `ifdef`, `ifndef`, `else`, `elif`, +`endif`, `line`, `error`, or `pragma`. Their token type is +`TokenType.DIRECTIVE_NAME`. + +The tokens that represent the end of the directive must have a newline +character as the content. Their token type is `TokenType.DIRECTIVE_END`. 
+ +### Include directives + +When the directive name equals `include`, the argument must be either: + +- a standard header name between angle brackets (`<` and `>`) +- a filename between double quotes (`"` and `"`) +- any other form that expands to a standard header name or a filename after + macro replacement + +A standard header name and a filename are preprocessing tokens. + +The content of the tokens can be incomplete. For example, they may not +terminate, and so on. + +The token types of standard header names and filenames are +`TokenType.STANDARD_HEADER` and `TokenType.FILENAME`, respectively. + +## Unknown tokens + +When the lexical parser encounters characters that do not fit the above +description, it returns an unknown token containing them. + +The token type of unknown tokens is `TokenType.UNKNOWN`. + +## API Reference + +- [com.maroontress.clione][apiref-maroontress.clione] module + +[isUnicodeIdentifierPart]: + https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/lang/Character.html#isUnicodeIdentifierPart(int) +[isUnicodeIdentifierStart]: + https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/lang/Character.html#isUnicodeIdentifierStart(int) +[apiref-maroontress.clione]: + https://maroontress.github.io/Clione-Java/api/latest/html/index.html +[wikipedia-trigraph]: + https://en.wikipedia.org/wiki/Digraphs_and_trigraphs#C +[wikipedia-escape-character]: + https://en.wikipedia.org/wiki/Escape_sequences_in_C#Table_of_escape_sequences +[wikipedia-phases-of-translation]: + https://en.wikipedia.org/wiki/C_preprocessor#Phases diff --git a/build.gradle b/build.gradle new file mode 100644 index 0000000..2244fcb --- /dev/null +++ b/build.gradle @@ -0,0 +1,79 @@ +plugins { + id 'java' + id 'jacoco' + id 'checkstyle' + id 'maven-publish' +} + +group 'com.maroontress' +version '1.0' + +repositories { + mavenCentral() +} + +dependencies { + testImplementation 'org.hamcrest:hamcrest:2.2' + testImplementation 
'org.junit.jupiter:junit-jupiter-api:5.8.2' + testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.2' +} + +compileJava { + options.encoding = 'UTF-8' + sourceCompatibility = 11 + targetCompatibility = 11 +} + +compileTestJava { + options.encoding = 'UTF-8' + sourceCompatibility = 14 +} + +test { + useJUnitPlatform() + finalizedBy jacocoTestReport +} + +jacocoTestReport { + dependsOn test + reports { + xml.required = false + csv.required = false + html.outputLocation = layout.buildDirectory.dir('reports/jacoco-html') + } +} + +java { + withJavadocJar() + withSourcesJar() +} + +javadoc { + options.addStringOption("locale", "en-US") +} + +checkstyle { + toolVersion '9.2.1' +} + +checkstyleMain { + configFile = file("${rootDir}/config/checkstyle/main.xml") +} + +checkstyleTest { + configFile = file("${rootDir}/config/checkstyle/test.xml") +} + +publishing { + repositories { + maven { + name = 'LocalFile' + url = uri("${localMavenRepo}") + } + } + publications { + gpr(MavenPublication) { + from(components.java) + } + } +} diff --git a/config/checkstyle/main.xml b/config/checkstyle/main.xml new file mode 100644 index 0000000..e3d7395 --- /dev/null +++ b/config/checkstyle/main.xml @@ -0,0 +1,921 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/config/checkstyle/test.xml b/config/checkstyle/test.xml new file mode 100644 index 0000000..3746b6e --- /dev/null +++ b/config/checkstyle/test.xml @@ -0,0 +1,891 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..a4afdd9 --- /dev/null +++ b/gradle.properties @@ -0,0 +1,3 @@ +org.gradle.jvmargs=-Duser.language=en -Duser.country=US +file.encoding=utf-8 +localMavenRepo="${projectRoot}/maven" diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..7454180 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..ffed3a2 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,5 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-7.2-bin.zip +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..76ccb9c --- /dev/null +++ b/gradlew @@ -0,0 +1,234 @@ +#!/bin/sh + +# +# Copyright ? 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. 
+# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. 
+while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit + +APP_NAME="Gradle" +APP_BASE_NAME=${0##*/} + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! 
"$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. + +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + +# Collect all arguments for the java command; +# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. 
+ +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 0000000..107acd3 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. 
+@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/settings.gradle b/settings.gradle new file mode 100644 index 0000000..09f6633 --- /dev/null +++ b/settings.gradle @@ -0,0 +1 @@ +rootProject.name = 'clione' diff --git a/src/main/java/com/maroontress/clione/Keywords.java b/src/main/java/com/maroontress/clione/Keywords.java new file mode 100644 index 0000000..4b637cb --- /dev/null +++ b/src/main/java/com/maroontress/clione/Keywords.java @@ -0,0 +1,62 @@ +package com.maroontress.clione; + +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + Provides the reserved words of C and the directive names. + + @see + Wikipedia, Reserved words +*/ +public final class Keywords { + + /** The unmodifiable set of keywords defined in C89. */ + public static final Set C89 = C89Keywords.ALL; + + /** The unmodifiable set of keywords defined in C99. */ + public static final Set C99 = C99Keywords.ALL; + + /** The unmodifiable set of keywords defined in C11. */ + public static final Set C11 = C11Keywords.ALL; + + /** The unmodifiable set of preprocessing directive names. */ + public static final Set PP_DIRECTIVE_NAMES = Set.of( + "include", + "define", + "if", "ifdef", "ifndef", "elif", "else", "endif", + "line", + "error", + "pragma"); + + /** Prevents the class from being instantiated. 
*/ + private Keywords() { + throw new AssertionError(); + } + + private static Set union(Set s1, Set s2) { + return Stream.concat(s1.stream(), s2.stream()) + .collect(Collectors.toUnmodifiableSet()); + } + + private static class C89Keywords { + private static final Set ALL = Set.of("auto", "break", "case", + "char", "const", "continue", "default", "do", "double", "else", + "enum", "extern", "float", "for", "goto", "if", "int", "long", + "register", "return", "short", "signed", "sizeof", "static", + "struct", "switch", "typedef", "union", "unsigned", "void", + "volatile", "while"); + } + + private static class C99Keywords { + private static final Set ALL = union(C89Keywords.ALL, Set.of( + "_Bool", "_Complex", "_Imaginary", "inline", "restrict")); + } + + private static class C11Keywords { + private static final Set ALL = union(C99Keywords.ALL, Set.of( + "_Alignas", "_Alignof", "_Atomic", "_Generic", "_Noreturn", + "_Static_assert", "_Thread_local")); + } +} diff --git a/src/main/java/com/maroontress/clione/LexicalParser.java b/src/main/java/com/maroontress/clione/LexicalParser.java new file mode 100644 index 0000000..11e42a4 --- /dev/null +++ b/src/main/java/com/maroontress/clione/LexicalParser.java @@ -0,0 +1,106 @@ +package com.maroontress.clione; + +import java.io.IOException; +import java.io.Reader; +import java.util.Collection; +import java.util.Optional; +import com.maroontress.clione.impl.DefaultLexicalParser; + +/** + The lexical parser. + +

The {@link LexicalParser} object creates and returns a token from the + stream of the source file. Most tokens appear verbatim in the source file, + but trigraph and digraph substitution and line concatenation may result in + tokens that are not in the source file. It returns no token when it + finally reaches the end of the source file.

+ +

The {@link Token} objects that the {@link #next()} method returns are + the preprocessing tokens. So, the evaluation is necessary before using + their content. In other words, they can be incomplete according to the + token type. For example, the string literal or comment may not terminate, + the preprocessing number may not represent valid integer and floating-point + constants, and so on.

+*/ +public interface LexicalParser extends AutoCloseable { + + /** {@inheritDoc} */ + @Override + void close() throws IOException; + + /** + Returns the character representing EOF. + +

Note that there is no need for this method in most cases. + If you want to detect when the line concatenation (a backslash + followed by a newline character) is immediately followed by EOF, + you can do so as follows:

+
+        void parse(LexicalParser parser) {
+            for (;;) {
+                var maybeToken = parser.next();
+                if (maybeToken.isEmpty()) {
+                    var maybeEof = parser.getEof();
+                    assert(maybeEof.isPresent());
+                    var eof = maybeEof.get();
+                    assert(eof.isEof());
+                    var list = eof.getChildren();
+                    if (list.size() > 0) {
+                        var m = "backslash-newline at end of file";
+                        System.err.println(eof.getSpan() + ": warning: " + m);
+                    }
+                    break;
+                }
+                ...
+            }
+        }
+ + @return The character representing EOF. Or {@link Optional#empty()} + if this parser has not yet reached EOF. + @throws IOException If an I/O error occurs. + */ + Optional getEof() throws IOException; + + /** + Returns the current location of the source file. + + @return The current location. + */ + SourceLocation getLocation(); + + /** + Returns the next token. + + @return The next token. Or {@link Optional#empty()} if this parser + reaches EOF. + @throws IOException If an I/O error occurs. + */ + Optional next() throws IOException; + + /** + Returns a new {@link LexicalParser} object. + +

The instance considers {@link Keywords#C11} as reserved + keywords.

+ + @param reader The reader that provides the stream of the source file. + @return The new {@link LexicalParser} object. + */ + static LexicalParser of(Reader reader) { + return new DefaultLexicalParser(reader); + } + + /** + Returns a new {@link LexicalParser} object with the specified reserved + words. + + @param reader The reader that provides the stream of the source file. + @param reservedWords The collection that contains reserved words. + Note that the constructor copies the collection, so changes to the + collection do not affect this instance. + @return The new {@link LexicalParser} object. + */ + static LexicalParser of(Reader reader, Collection reservedWords) { + return new DefaultLexicalParser(reader, reservedWords); + } +} diff --git a/src/main/java/com/maroontress/clione/SourceChar.java b/src/main/java/com/maroontress/clione/SourceChar.java new file mode 100644 index 0000000..9527518 --- /dev/null +++ b/src/main/java/com/maroontress/clione/SourceChar.java @@ -0,0 +1,95 @@ +package com.maroontress.clione; + +import java.util.List; +import com.maroontress.clione.impl.Eof; + +/** + A character of the source file or EOF. + +

The {@link SourceChar} object represents a character that composes the + token or EOF. It may also have one or more child characters in some cases. + For example, this is the case when it represents:

+ +
    +
  • the character which is substituted for any digraph or trigraph + sequence
  • +
  • the character that follows a backslash ({@code \}) at the end of the + line
  • +
+*/ +public interface SourceChar { + + /** The empty unmodifiable list of {@link SourceChar} objects. */ + List EMPTY_LIST = List.of(); + + /** + The {@link SourceChar} object representing the end of the source file. + +

This object behaves as follows:

+
    +
  • The {@link #isEof()} method returns {@code true}
  • +
  • The {@link #toChar()} and {@link #getSpan()} methods throw an + {@link IllegalStateException}
  • +
  • The {@link #getChildren()} method returns {@link #EMPTY_LIST}
  • +
+ +

Do not compare a {@link SourceChar} object with this object to + determine whether it is EOF or not. Use the {@link #isEof()} method + instead.

+ +

Note that this is an immutable object.

+ */ + SourceChar STATIC_EOF = new Eof() { + @Override + public SourceSpan getSpan() { + throw new IllegalStateException(); + } + + @Override + public List getChildren() { + return EMPTY_LIST; + } + }; + + /** + Returns whether this object represents EOF. + + @return {@code true} if this represents EOF. + */ + boolean isEof(); + + /** + Returns a {@code char} value corresponding to this object. + +

This method throws an {@link IllegalStateException} if this object + represents EOF.

+ + @return The {@code char} value. + @throws IllegalStateException If this object represents EOF. + */ + char toChar(); + + /** + Returns location of this object in the source file. + +

If this object has its child characters, its location depends on + their location. So, its location may represent a range of the + characters.

+ +

This method throws an {@link IllegalStateException} if this object + represents EOF and has no child characters.

+ + @return The location. + @throws IllegalStateException If this object represents EOF and has no + child characters. + */ + SourceSpan getSpan(); + + /** + Returns the child characters. + + @return The unmodifiable list containing the child characters, or + {@link #EMPTY_LIST} if this object has no child characters. + */ + List getChildren(); +} diff --git a/src/main/java/com/maroontress/clione/SourceLocation.java b/src/main/java/com/maroontress/clione/SourceLocation.java new file mode 100644 index 0000000..a8cc3af --- /dev/null +++ b/src/main/java/com/maroontress/clione/SourceLocation.java @@ -0,0 +1,67 @@ +package com.maroontress.clione; + +/** + The location of the source file. + +

Note that the {@link SourceLocation} instance is an immutable + object.

+*/ +public final class SourceLocation { + + private final int line; + private final int column; + + /** + Creates a new instance. + + @param line The line number. + @param column The column number. + @throws IllegalArgumentException If the {@code line} or {@code column} + is less than or equal to zero. + */ + public SourceLocation(int line, int column) { + if (line <= 0) { + throw new IllegalArgumentException("line must be greater than 0"); + } + if (column <= 0) { + throw new IllegalArgumentException("column must be greater than 0"); + } + this.line = line; + this.column = column; + } + + /** + Returns the line number. + + @return The line number. + */ + public int getLine() { + return line; + } + + /** + Returns the column number. + + @return The column number. + */ + public int getColumn() { + return column; + } + + /** + Returns a new string representation of this location that is easy for + a person to read. + +

This method returns a string equal to the value of:

+
Lnn:mm
+

where {@code mm} and {@code nn} are positive integers, + {@code nn} represents the line number, + and {@code mm} represents the column number.

+ + @return The new string representation of this token. + */ + @Override + public String toString() { + return "L" + line + ":" + column; + } +} diff --git a/src/main/java/com/maroontress/clione/SourceSpan.java b/src/main/java/com/maroontress/clione/SourceSpan.java new file mode 100644 index 0000000..14382e5 --- /dev/null +++ b/src/main/java/com/maroontress/clione/SourceSpan.java @@ -0,0 +1,99 @@ +package com.maroontress.clione; + +/** + The representation of the contiguous range of the source file. + +

Note that the {@link SourceSpan} instance is an immutable object.

+*/ +public final class SourceSpan { + + private final SourceLocation start; + private final SourceLocation end; + + /** + Creates a new instance. + + @param start The start location of the range. + @param end The end location of the range. + */ + public SourceSpan(SourceLocation start, SourceLocation end) { + this.start = start; + this.end = end; + } + + /** + Creates a new instance representing the single character. + + @param where The start and end location of the range. + */ + public SourceSpan(SourceLocation where) { + this(where, where); + } + + /** + Creates a new instance representing the range between the specified + {@link SourceSpan}s (that includes both of them). + + @param first The start span. + @param last The end span. + */ + public SourceSpan(SourceSpan first, SourceSpan last) { + this(first.start, last.end); + } + + /** + Returns the start location of this range. + + @return The start location. + */ + public SourceLocation getStart() { + return start; + } + + /** + Returns the end location of this range. + + @return The end location. + */ + public SourceLocation getEnd() { + return end; + } + + /** + Returns a new string representation of this range that is easy for + a person to read. + +

This method returns a string equal to the value of either:

+
    +
  • {@code Ln1:xx--Ln2:xx}
  • +
  • {@code Ln1:m1--m2}
  • +
  • {@code Ln1:m1}
  • +
+

where {@code n1}, {@code n2}, {@code m1}, {@code m2}, and {@code xx} + are positive integers, {@code n1} &lt; {@code n2}, {@code m1} &lt; + {@code m2}, {@code n1} and {@code n2} represent the line number, + and {@code m1}, {@code m2}, and {@code xx} represent the column + number.

+ + @return The new string representation of this token. + */ + @Override + public String toString() { + var startLine = start.getLine(); + var endLine = end.getLine(); + if (startLine != endLine) { + return "L" + startLine + + ":" + start.getColumn() + + "--L" + endLine + + ":" + end.getColumn(); + } + var startColumn = start.getColumn(); + var endColumn = end.getColumn(); + return (startColumn == endColumn) + ? "L" + startLine + + ":" + startColumn + : "L" + startLine + + ":" + startColumn + + "--" + endColumn; + } +} diff --git a/src/main/java/com/maroontress/clione/Token.java b/src/main/java/com/maroontress/clione/Token.java new file mode 100644 index 0000000..5eac96a --- /dev/null +++ b/src/main/java/com/maroontress/clione/Token.java @@ -0,0 +1,98 @@ +package com.maroontress.clione; + +import java.util.Collection; +import java.util.List; + +/** + The preprocessing token. + +

{@link Token} objects can have children, which means they can be in a + tree structure. For tokens that the {@link LexicalParser#next()} method + returns, only tokens of type {@link TokenType#DIRECTIVE} have children.

+ +

The {@link Token} object has its type, span, and characters. The type is + one of the constants defined in {@link TokenType}, the span represents the + range of the source file where the token occurs, and the characters are + {@link SourceChar} objects that compose it.

+ +

Note that the {@link Token} object is an immutable object.

+*/ +public interface Token { + + /** + Returns a new string representing this token. + +

The string that this method returns gives no indication of the + token type and does not include the content of the child tokens.

+ + @return The new string representing this token. + */ + String getValue(); + + /** + Returns a new span representing the range of this token in the source + file. + + @return A new span representing the range of this token. + */ + SourceSpan getSpan(); + + /** + Returns the characters that compose this token. + + @return The unmodifiable list containing the characters that compose + this token. + */ + List getChars(); + + /** + Returns the type of this token. + + @return The type of this token. + */ + TokenType getType(); + + /** + Returns the child tokens. + +

This method returns an empty list if this token has no child + tokens.

+ + @return The unmodifiable list containing the child tokens. + */ + List getChildren(); + + /** + Returns a new token that has the same content of this token but + has the specified token type. + + @param newType The token type of the new token. + @return The new token. + */ + Token withType(TokenType newType); + + /** + Returns a new token that has the same content of this token but + has the specified child tokens. + + @param newChildren The child tokens of the new token. + @return The new token. + */ + Token withChildren(Collection newChildren); + + /** + Returns a new string representation of this token that is easy for + a person to read. + +

This method returns a string equal to the value of:

+
+        "[value=" + getValue() + ", span=" + getSpan() + ", "
+            + "chars=" + getChars() + ", type=" + getType() + ", "
+            + "children=" + getChildren() + "]";
+        
+ + @return The new string representation of this token. + */ + @Override + String toString(); +} diff --git a/src/main/java/com/maroontress/clione/TokenType.java b/src/main/java/com/maroontress/clione/TokenType.java new file mode 100644 index 0000000..c52e77a --- /dev/null +++ b/src/main/java/com/maroontress/clione/TokenType.java @@ -0,0 +1,139 @@ +package com.maroontress.clione; + +import java.io.Reader; +import java.util.Collection; + +/** + The constants representing the token type. +*/ +public enum TokenType { + + /** + The character constant beginning with either {@code "'"}, {@code "u'"}, + {@code "U'"}, or {@code "L'"} and ending with a {@code "'"}. + +

The character constant can contain an escape sequence between single + quotes.

+ */ + CHARACTER, + + /** + The comment beginning with a slash followed by an asterisk + ({@code /}{@code *}) and ending with an asterisk followed by a slash + ({@code *}{@code /}) that is not inside a character constant, a string + literal, or a standard header name. + */ + COMMENT, + + /** + The delimiter that is a sequence of the delimiter characters containing + a space character ({@code ' '}) and a horizontal tab character + ({@code '\t'}). + */ + DELIMITER, + + /** + The directive beginning with a number character ({@code #}) and + ending with a newline character. + */ + DIRECTIVE, + + /** + The directive name that is an identifier followed by the number + character ({@code #}) that every directive begins with (except + delimiters). + +

The identifier of a directive name can be one of {@code include}, + {@code define}, {@code if}, {@code ifdef}, {@code ifndef}, + {@code elif}, {@code else}, {@code endif}, {@code line}, {@code error}, + or {@code pragma}.

+ */ + DIRECTIVE_NAME, + + /** The end of directive ({@code '\n'}). */ + DIRECTIVE_END, + + /** + The filename between double quotes ({@code "}) that follows an + {@code include} directive name. + +

Note that this differs from {@link #STRING}.

+ */ + FILENAME, + + /** + The identifier. + +

An identifier can contain the following characters:

+
    +
  • an underscore character or an uppercase or lowercase letter + ({@code [_A-Za-z]})
  • +
  • a digit ({@code [0-9]})
  • +
  • universal character names ({@code \}{@code uXXXX} or + {@code \}{@code UXXXXXXXX}, + where {@code X} is a hexadecimal digit)
  • +
  • other implementation-defined characters
  • +
+

However, the first character of an identifier cannot be a digit.

+ */ + IDENTIFIER, + + /** + The number that is an integer constant or a floating-point number + constant. + */ + NUMBER, + + /** + The operator. + +

The operator is either: {@code +}, {@code -}, {@code *}, {@code /}, + {@code %}, {@code ++}, {@code --}, {@code ==}, {@code !=}, + {@code >}, {@code <}, {@code >=}, {@code <=}, + {@code !}, {@code &&}, {@code ||}, + {@code ~}, {@code &}, {@code |}, {@code ^}, {@code >>}, {@code <<}, + {@code =}, {@code +=}, {@code -=}, {@code *=}, {@code /=}, {@code %=}, + {@code &=}, {@code |=}, {@code ^=}, + {@code <<=}, {@code >>=}, + {@code '.'}, {@code '->'}, or {@code '?'}.

+ */ + OPERATOR, + + /** + The punctuator that is either {@code '['}, {@code ']'}, {@code '('}, + {@code ')'}, '{', '}', {@code ','}, + {@code ';'}, {@code ':'}, or {@code '...'}. + +

Note that some punctuators may be C operators according to the + syntactic context.

+ */ + PUNCTUATOR, + + /** + The reserved words that are identifiers but the string collection + specified with the factory method of + {@link LexicalParser#of(Reader, Collection)}. + */ + RESERVED, + + /** + The standard header name between angle brackets ({@code '<'} and + {@code '>'}) that follow an {@code include} directive. + +

For example: {@code <stdio.h>}, {@code <stdlib.h>}.

+ */ + STANDARD_HEADER, + + /** + The string literal beginning with either {@code '"'}, {@code 'u"'}, + {@code 'U"'}, {@code 'L"'}, or {@code 'u8"'} and ending with a + {@code '"'}. + +

The string literal can contain escape sequences between double + quotes.

+ */ + STRING, + + /** The unknown token that is invalid in the syntax. */ + UNKNOWN, +} diff --git a/src/main/java/com/maroontress/clione/impl/AbstractReader.java b/src/main/java/com/maroontress/clione/impl/AbstractReader.java new file mode 100644 index 0000000..2e74320 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/AbstractReader.java @@ -0,0 +1,35 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.io.Reader; + +/** + An abstract {@link Reader} with the only one abstract method that is + {@link Reader#read()}. +*/ +public abstract class AbstractReader extends Reader { + + /** {@inheritDoc} */ + @Override + public final int read(char[] array, int offset, int length) + throws IOException { + if (length == 0) { + return 0; + } + if (length < 0 || offset < 0) { + throw new IndexOutOfBoundsException(); + } + var n = offset + length; + if (n < offset || n > array.length) { + throw new IndexOutOfBoundsException(); + } + for (var k = offset; k < n; ++k) { + var i = read(); + if (i == -1) { + return k - offset; + } + array[k] = (char) i; + } + return length; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/Case.java b/src/main/java/com/maroontress/clione/impl/Case.java new file mode 100644 index 0000000..0aa3a9f --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Case.java @@ -0,0 +1,153 @@ +package com.maroontress.clione.impl; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import com.maroontress.clione.TokenType; + +/** + The mapping of one or more characters to a tokenizer. + +

Note that the {@link Case} instance is an immutable object.

+*/ +public final class Case { + + private final Set charSet; + private final Tokenizer tokenizer; + + private Case(Set charSet, Tokenizer tokenizer) { + this.charSet = Set.copyOf(charSet); + this.tokenizer = tokenizer; + } + + /** + Returns the unmodifiable set of characters that this case maps. + + @return The unmodifiable set of characters. + */ + public Set getCharSet() { + return charSet; + } + + /** + Returns the tokenizer that this case maps characters to. + + @return The tokenizer. + */ + public Tokenizer getTokenizer() { + return tokenizer; + } + + /** + Returns a new case that starts with the specified character and has + the specified tokenizer. + + @param c The character that the case starts with. + @param tokenizer The tokenizer associated with the case. + @return The new case. + */ + public static Case of(char c, Tokenizer tokenizer) { + return new Case(Set.of(c), tokenizer); + } + + /** + Returns a new case that maps the specified character set to the + specified tokenizer. + + @param set The set of characters to map + @param tokenizer The tokenizer to map characters to. + @return The new case. + */ + public static Case of(Set set, Tokenizer tokenizer) { + return new Case(set, tokenizer); + } + + /** + Returns a new case that starts with the specified character + and that one of the specified cases may follow. + + @param c The character that the case starts with. + @param otherwise The token type that the tokenizer returns when none + of the specified cases follows the character {@code c}. + @param cases The cases that may follow the character {@code c}. + @return The new case. + */ + public static Case of(char c, TokenType otherwise, Case... cases) { + var mapper = newMapper(cases); + return Case.of(c, x -> x.tryReadToken(mapper, otherwise)); + } + + /** + Returns a new case that starts with the specified character + and that one of the specified cases may follow. + + @param c The character that the case starts with. 
+ @param otherwise The tokenizer returns when none of the specified + cases follows the character {@code c}. + @param cases The cases that may follow the character {@code c}. + @return The new case. + */ + public static Case of(char c, Tokenizer otherwise, Case... cases) { + var mapper = newMapper(cases); + return Case.of(c, x -> x.tryReadToken(mapper, otherwise)); + } + + /** + Returns a new case that starts with one of characters in the + specified set and that one of the specified cases may follow it. + + @param set The set of characters, one of which the case starts with. + @param otherwise The token type that the tokenizer returns when none + of the specified cases follows the one of the characters. + @param cases The cases that may follow one of the characters. + @return The new case. + */ + public static Case of(Set set, TokenType otherwise, + Case... cases) { + var mapper = newMapper(cases); + return Case.of(set, x -> x.tryReadToken(mapper, otherwise)); + } + + private static Map newMap(List list) { + var map = new HashMap(); + for (var i : list) { + var set = i.getCharSet(); + var reader = i.getTokenizer(); + for (var c : set) { + map.putIfAbsent(c, reader); + } + } + return Map.copyOf(map); + } + + /** + Returns a new mapping function that takes a character and returns a + tokenizer associated with the character with the specified cases. + + @param cases The cases. + @return The new map. + */ + public static Mapper newMapper(Case... cases) { + var map = Case.newMap(List.of(cases)); + return map::get; + } + + /** + The function that that takes a character and returns a tokenizer + associated with the character. + */ + @FunctionalInterface + public interface Mapper { + /** + Returns the tokenizer with which the specified character is + associated, or {@code null} if there is no tokenizer associated + with the character. + + @param c The character. 
+ @return The tokenizer associated with the character {@code c}, + or {@code null} if there is no tokenizer associated with {@code c}. + */ + Tokenizer get(char c); + } +} diff --git a/src/main/java/com/maroontress/clione/impl/Chars.java b/src/main/java/com/maroontress/clione/impl/Chars.java new file mode 100644 index 0000000..1156908 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Chars.java @@ -0,0 +1,195 @@ +package com.maroontress.clione.impl; + +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + Provides immutable {@link Set} objects of a character and utility methods + for determining a character's category (that is letter, digit, and so on). +*/ +public final class Chars { + + /** A character set containing only digits. */ + public static final Set DIGIT_SET = Sets.DIGITS; + + /** A character set containing only delimiters inside a directive. */ + public static final Set DIRECTIVE_DELIMITER_SET + = Sets.DIRECTIVE_DELIMITERS; + + /** A character set containing only delimiters. */ + public static final Set DELIMITER_SET = Sets.DELIMITERS; + + /** + A set of characters which can be the first character of an identifier. + */ + public static final Set FIRST_OF_IDENTIFIER_SET = union( + List.of(Set.of('_'), + Sets.UPPER_CASE_LETTERS, + Sets.LOWER_CASE_LETTERS)); + + /** + A set of characters which an identifier can contain. Note that this + does not contain Unicode non-digit characters. 
+ */ + public static final Set IDENTIFIER_SET = union( + List.of(Set.of('_'), + Sets.UPPER_CASE_LETTERS, + Sets.LOWER_CASE_LETTERS, + Sets.DIGITS)); + + private static final Set PP_NUMBER_SET = union( + List.of(Set.of('.'), + Sets.UPPER_CASE_LETTERS, + Sets.LOWER_CASE_LETTERS, + Sets.DIGITS)); + + private static final Set HEX_DIGIT_SET = union( + List.of(Sets.DIGITS, + Sets.newCharSetWithRange('A', 'F'), + Sets.newCharSetWithRange('a', 'f'))); + + /** Prevents the class from being instantiated. */ + private Chars() { + throw new AssertionError(); + } + + /** + Determines if the specified character is a delimiter inside a + directive. + + @param c The character to be tested. + @return {@code true} if the character is a delimiter inside a + directive. + */ + public static boolean isDirectiveDelimiter(char c) { + return DIRECTIVE_DELIMITER_SET.contains(c); + } + + /** + Determines if the specified character is a delimiter. + + @param c The character to be tested. + @return {@code true} if the character is a delimiter. + */ + public static boolean isDelimiter(char c) { + return DELIMITER_SET.contains(c); + } + + /** + Determines if the specified character composes a preprocessing number + (except {@code '+'} and {@code '-'} following either {@code e} or + {@code E}). + + @param c The character to be tested. + @return {@code true} if the character is composes a preprocessing + number. + */ + public static boolean isPreprocessingNumber(char c) { + return PP_NUMBER_SET.contains(c); + } + + /** + Determines if the specified character is a digit ({@code [0-9]}). + + @param c The character to be tested. + @return {@code true} if the character is a digit. + */ + public static boolean isDigit(char c) { + return DIGIT_SET.contains(c); + } + + /** + Determines if the specified character may be part of a number suffix. + ({@code [uUlL]}). + + @param c The character to be tested. + @return {@code true} if the character be part of a number suffix. 
+ */ + public static boolean isNumberSuffix(char c) { + return c == 'u' || c == 'U' + || c == 'l' || c == 'L'; + } + + /** + Determines if the specified character is a hexadecimal digit + ({@code [0-9a-fA-F]}). + + @param c The character to be tested. + @return {@code true} if the character is a hexadecimal digit. + */ + public static boolean isHexDigit(char c) { + return HEX_DIGIT_SET.contains(c); + } + + /** + Determines if the specified character is an octal digit + ({@code [0-7]}). + + @param c The character to be tested. + @return {@code true} if the character is an octal digit. + */ + public static boolean isOctalDigit(char c) { + return c >= '0' && c <= '7'; + } + + /** + Determines if the specified character may be the first character in an + identifier (except Universal Character Names and other + implementation-defined characters). + + @param c The character to be tested. + @return {@code true} if the character may start an identifier. + */ + public static boolean isFirstName(char c) { + return FIRST_OF_IDENTIFIER_SET.contains(c); + } + + /** + Determines if the specified character may be part of an identifier as + other than the first character (except Universal Character Names and + other implementation-defined characters). + + @param c The character to be tested. + @return {@code true} if the character be part of an identifier. 
+ */ + public static boolean isName(char c) { + return IDENTIFIER_SET.contains(c); + } + + private static Set union(List> all) { + return all.stream() + .flatMap(Collection::stream) + .collect(Collectors.toUnmodifiableSet()); + } + + private static class Sets { + private static final Set UPPER_CASE_LETTERS + = newCharSetWithRange('A', 'Z'); + + private static final Set LOWER_CASE_LETTERS + = newCharSetWithRange('a', 'z'); + + private static final Set DIGITS + = newCharSetWithRange('0', '9'); + + private static final Set DIRECTIVE_DELIMITERS + = Set.of(' ', '\t'); + + private static final Set DELIMITERS + = Set.of(' ', '\t', '\n', '\f', '\u000b'); + + private static final Set ESCAPE_SEQUENCE + = Set.of('a', 'b', 'e', 'f', 'n', 'r', 't', 'v', + '\\', '\'', '"', '?'); + + private static Set newCharSetWithRange( + char start, char end) { + return IntStream.rangeClosed(start, end) + .mapToObj(c -> (char) c) + .collect(Collectors.toUnmodifiableSet()); + } + } +} diff --git a/src/main/java/com/maroontress/clione/impl/DefaultLexicalParser.java b/src/main/java/com/maroontress/clione/impl/DefaultLexicalParser.java new file mode 100644 index 0000000..67b2c50 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/DefaultLexicalParser.java @@ -0,0 +1,256 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import com.maroontress.clione.Keywords; +import com.maroontress.clione.LexicalParser; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceLocation; +import com.maroontress.clione.Token; +import com.maroontress.clione.TokenType; + +/** + The default implementation of {@link LexicalParser}. +*/ +public final class DefaultLexicalParser implements LexicalParser { + + private final Source source; + private final Set reservedWords; + + /** + Creates a new instance. + +

The instance considers {@link Keywords#C11} as reserved + words.

+ + @param reader The reader that provides the stream of the source file. + */ + public DefaultLexicalParser(Reader reader) { + this(reader, Keywords.C11); + } + + /** + Creates a new instance. + + @param reader The reader that provides the stream of the source file. + @param reservedWords The collection that contains reserved keywords. + Note that the constructor copies the collection, so changes to the + collection do not affect this instance. + */ + public DefaultLexicalParser(Reader reader, + Collection reservedWords) { + this(reader, Set.copyOf(reservedWords)); + } + + private DefaultLexicalParser(Reader reader, Set reservedWords) { + source = new PhaseTwoSource(new PhaseOneSource( + new ReaderSource(reader))); + this.reservedWords = reservedWords; + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + source.close(); + } + + /** {@inheritDoc} */ + @Override + public Optional getEof() throws IOException { + var c = source.getChar(); + if (!c.isEof()) { + source.ungetChar(c); + return Optional.empty(); + } + return Optional.of(c); + } + + /** {@inheritDoc} */ + @Override + public SourceLocation getLocation() { + return source.getLocation(); + } + + /** {@inheritDoc} */ + @Override + public Optional next() throws IOException { + return Optional.ofNullable(newToken()); + } + + private Token newToken() throws IOException { + var x = new Transcriber(source); + var type = readToken(x); + if (type == null) { + return null; + } + var token = x.toToken(type); + if (type == TokenType.IDENTIFIER) { + return reservedWords.contains(token.getValue()) + ? 
token.withType(TokenType.RESERVED) + : token; + } + if (type == TokenType.DIRECTIVE) { + return newDirectiveToken(token); + } + return token; + } + + private Token newDirectiveToken(Token token) throws IOException { + var children = newDirectiveChildTokens(); + return token.withChildren(children); + } + + private List newDirectiveChildTokens() throws IOException { + var children = new ArrayList(); + for (;;) { + var child = newDirectiveChildToken(); + if (child == null) { + return children; + } + var type = child.getType(); + if (type == TokenType.DIRECTIVE_END) { + children.add(child); + return children; + } + if (type == TokenType.DELIMITER || type == TokenType.COMMENT) { + children.add(child); + continue; + } + var value = child.getValue(); + if (!Keywords.PP_DIRECTIVE_NAMES.contains(value)) { + // INVALID + children.add(child); + addDirectiveTokens(children); + return children; + } + children.add(child.withType(TokenType.DIRECTIVE_NAME)); + if (value.equals("include")) { + addIncludeDirectiveTokens(children); + return children; + } + addDirectiveTokens(children); + return children; + } + } + + private void addDirectiveTokens(List list) + throws IOException { + for (;;) { + var token = newDirectiveChildToken(); + if (token == null) { + return; + } + list.add(token); + var type = token.getType(); + if (type == TokenType.DIRECTIVE_END) { + return; + } + } + } + + private void addIncludeDirectiveTokens(List list) + throws IOException { + for (;;) { + var token = newIncludeDirectiveChildToken(); + if (token == null) { + return; + } + list.add(token); + var type = token.getType(); + if (type == TokenType.DIRECTIVE_END) { + return; + } + if (type == TokenType.DELIMITER + || type == TokenType.COMMENT) { + continue; + } + addDirectiveTokens(list); + return; + } + } + + private Token newDirectiveChildToken() throws IOException { + return newChildToken(DefaultLexicalParser::readDirectiveToken); + } + + private Token newIncludeDirectiveChildToken() throws IOException { + 
return newChildToken(DefaultLexicalParser::readIncludeDirectiveToken); + } + + private Token newChildToken(NextTokenReader reader) throws IOException { + var x = new Transcriber(source); + var type = reader.apply(x); + if (type == null) { + return null; + } + var token = x.toToken(type); + if (type == TokenType.IDENTIFIER) { + return reservedWords.contains(token.getValue()) + ? token.withType(TokenType.RESERVED) + : token; + } + return token; + } + + private static TokenType readToken(Transcriber x) throws IOException { + return x.readTokenOtherwise(Switches.DEFAULT, + DefaultLexicalParser::readSymbol); + } + + private static TokenType readDirectiveToken(Transcriber x) + throws IOException { + return x.readTokenOtherwise(Switches.DIRECTIVE, + DefaultLexicalParser::readSymbol); + } + + private static TokenType readIncludeDirectiveToken(Transcriber x) + throws IOException { + return x.readTokenOtherwise(Switches.INCLUDE_DIRECTIVE, + DefaultLexicalParser::readSymbol); + } + + private static TokenType readSymbol(Transcriber x, SourceChar i) + throws IOException { + var s = x.getSource(); + var b = x.getBuilder(); + var c = i.toChar(); + if (Character.isHighSurrogate(c)) { + var j = s.getChar(); + if (j.isEof()) { + b.append(i); + return TokenType.UNKNOWN; + } + var n = j.toChar(); + if (!Character.isLowSurrogate(n)) { + s.ungetChar(j); + b.append(i); + return TokenType.UNKNOWN; + } + b.append(i); + b.append(j); + var u = Character.toCodePoint(c, n); + if (!Character.isUnicodeIdentifierStart(u)) { + return TokenType.UNKNOWN; + } + x.readIdentifier(); + return TokenType.IDENTIFIER; + } + if (Character.isUnicodeIdentifierStart(c)) { + b.append(i); + x.readIdentifier(); + return TokenType.IDENTIFIER; + } + b.append(i); + return TokenType.UNKNOWN; + } + + @FunctionalInterface + private interface NextTokenReader { + TokenType apply(Transcriber x) throws IOException; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/DefaultToken.java 
b/src/main/java/com/maroontress/clione/impl/DefaultToken.java new file mode 100644 index 0000000..e62391c --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/DefaultToken.java @@ -0,0 +1,102 @@ +package com.maroontress.clione.impl; + +import java.util.Collection; +import java.util.List; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceSpan; +import com.maroontress.clione.Token; +import com.maroontress.clione.TokenType; + +/** + The implementation of a preprocessing token. + */ +public final class DefaultToken implements Token { + + private final List chars; + private final TokenType type; + private final List children; + + /** + Creates a new instance. + + @param chars The collection of {@link SourceChar} objects that compose + this token. It must not contain any character representing EOF. + @param type The token type of this token. + */ + public DefaultToken(Collection chars, TokenType type) { + this(chars, type, List.of()); + } + + private DefaultToken(Collection chars, TokenType type, + Collection children) { + this.chars = List.copyOf(chars); + this.type = type; + this.children = List.copyOf(children); + } + + /** {@inheritDoc} */ + @Override + public String getValue() { + var size = chars.size(); + var b = new StringBuilder(size); + for (var c : chars) { + b.append(c.toChar()); + } + return b.toString(); + } + + /** {@inheritDoc} */ + @Override + public SourceSpan getSpan() { + var start = chars.get(0).getSpan().getStart(); + if (children.isEmpty()) { + var last = chars.size() - 1; + var end = chars.get(last).getSpan().getEnd(); + return new SourceSpan(start, end); + } + var end = children.get(children.size() - 1) + .getSpan() + .getEnd(); + return new SourceSpan(start, end); + } + + /** {@inheritDoc} */ + @Override + public List getChars() { + return chars; + } + + /** {@inheritDoc} */ + @Override + public TokenType getType() { + return type; + } + + /** {@inheritDoc} */ + @Override + public List getChildren() { + 
return children; + } + + /** {@inheritDoc} */ + @Override + public Token withType(TokenType newType) { + return new DefaultToken(chars, newType, children); + } + + /** {@inheritDoc} */ + @Override + public Token withChildren(Collection newChildren) { + return new DefaultToken(chars, type, newChildren); + } + + /** {@inheritDoc} */ + @Override + public String toString() { + return "[value=" + getValue() + ", " + + "span=" + getSpan() + ", " + + "chars=" + chars + ", " + + "type=" + type + ", " + + "children=" + children + "]"; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/DefaultTokenizer.java b/src/main/java/com/maroontress/clione/impl/DefaultTokenizer.java new file mode 100644 index 0000000..520d52f --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/DefaultTokenizer.java @@ -0,0 +1,45 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.TokenType; + +/** + The function that changes the state of the specified {@link Transcriber} + object with the specified {@link SourceChar} object, lets the + {@link Transcriber} object read characters from its source and store a new + token in its builder, and returns the token type of the stored token. + + @see Transcriber#readTokenOtherwise(Case.Mapper, DefaultTokenizer) + @see Tokenizer +*/ +@FunctionalInterface +public interface DefaultTokenizer { + /** + Returns the token type of the token composed of the specified + {@link SourceChar} and if needed the characters supplied from the + specified {@link Transcriber} object. + +

The transcriber may read characters from its source to build a + new token. It stores the token being built in its {@link TokenBuilder} + object. So use the {@link Transcriber#toToken(TokenType)} method to get + the new token object as follows:

+ +
+        Token newToken(Transcriber x, SourceChar c,
+                       DefaultTokenizer otherwise) throws IOException {
+            var type = otherwise.apply(x, c);
+            return x.toToken(type);
+        }
+ +

Note that this function does not return {@code null} unlike + {@link Tokenizer#apply(Transcriber)}.

+ + @param x The transcriber. + @param c The first character that the {@link Transcriber} has read from + its source but has not yet been stored to its builder. + @return The token type. + @throws IOException If an I/O error occurs. + */ + TokenType apply(Transcriber x, SourceChar c) throws IOException; +} diff --git a/src/main/java/com/maroontress/clione/impl/Digraphs.java b/src/main/java/com/maroontress/clione/impl/Digraphs.java new file mode 100644 index 0000000..46ad108 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Digraphs.java @@ -0,0 +1,140 @@ +package com.maroontress.clione.impl; + +import com.maroontress.clione.TokenType; + +/** + Provides the facility of digraph substitution. + +

In general, digraphs are sequences of two characters that appear in + source code and should be treated as if they were single characters. In + the C programming language, any digraph must always represent a full token + by itself. The following table lists all valid operator or punctuator + tokens represented with digraphs:

+ + + + + + + + + + + + +
Tokens represented with digraphs.
TokenEquivalent
<:[
:>]
<%{
%>}
%:#
%:%:##
+ +

Note that neither {@code %:#} nor {@code #%:} is equivalent to + the token {@code ##} or {@code %:%:}.

+ + @see + Wikipedia, Digraphs and trigraphs +*/ +public final class Digraphs { + + /** Prevents the class from being instantiated. */ + private Digraphs() { + throw new AssertionError(); + } + + /** + Substitutes a number sign (with which the preprocessing directive + starts) for the character sequence that the specified transcriber + stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#DIRECTIVE}). + */ + public static TokenType toDirective(Transcriber x) { + x.getBuilder().replaceDigraph('#'); + return TokenType.DIRECTIVE; + } + + /** + Substitutes a double number sign (that appears other than in the macro + declaration) for the character sequence that the specified transcriber + stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#UNKNOWN}). + */ + public static TokenType toUnknownDoubleNumberSign(Transcriber x) { + x.getBuilder().replaceDigraph('#', '#'); + return TokenType.UNKNOWN; + } + + /** + Substitutes a number sign (a preprocessing stringification operator + {@code #}) for the character sequence that the specified transcriber + stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#OPERATOR}). + */ + public static TokenType toStringificationOperator(Transcriber x) { + x.getBuilder().replaceDigraph('#'); + return TokenType.OPERATOR; + } + + /** + Substitutes a double number sign (a preprocessing token-pasting + operator {@code ##}) for the character sequence that the specified + transcriber stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#OPERATOR}). + */ + public static TokenType toTokenPastingOperator(Transcriber x) { + x.getBuilder().replaceDigraph('#', '#'); + return TokenType.OPERATOR; + } + + /** + Substitutes a right bracket for the character sequence that the + specified transcriber stores in its builder. + + @param x The transcriber. 
+ @return The token type ({@link TokenType#PUNCTUATOR}). + */ + public static TokenType toRightBracket(Transcriber x) { + return toPunctuator(x, ']'); + } + + /** + Substitutes a left bracket for the character sequence that the + specified transcriber stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#PUNCTUATOR}). + */ + public static TokenType toLeftBracket(Transcriber x) { + return toPunctuator(x, '['); + } + + /** + Substitutes a left brace for the character sequence that the + specified transcriber stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#PUNCTUATOR}). + */ + public static TokenType toLeftBrace(Transcriber x) { + return toPunctuator(x, '{'); + } + + /** + Substitutes a right brace for the character sequence that the + specified transcriber stores in its builder. + + @param x The transcriber. + @return The token type ({@link TokenType#PUNCTUATOR}). + */ + public static TokenType toRightBrace(Transcriber x) { + return toPunctuator(x, '}'); + } + + private static TokenType toPunctuator(Transcriber x, char c) { + x.getBuilder().replaceDigraph(c); + return TokenType.PUNCTUATOR; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/Eof.java b/src/main/java/com/maroontress/clione/impl/Eof.java new file mode 100644 index 0000000..f7ad98b --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Eof.java @@ -0,0 +1,31 @@ +package com.maroontress.clione.impl; + +import java.util.List; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceSpan; + +/** + Represents the abstract EOF. 
+*/ +public abstract class Eof implements SourceChar { + + /** {@inheritDoc} */ + @Override + public final boolean isEof() { + return true; + } + + /** {@inheritDoc} */ + @Override + public final char toChar() { + throw new IllegalStateException(); + } + + /** {@inheritDoc} */ + @Override + public abstract SourceSpan getSpan(); + + /** {@inheritDoc} */ + @Override + public abstract List getChildren(); +} diff --git a/src/main/java/com/maroontress/clione/impl/PhaseOneSource.java b/src/main/java/com/maroontress/clione/impl/PhaseOneSource.java new file mode 100644 index 0000000..b86da48 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/PhaseOneSource.java @@ -0,0 +1,98 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.util.Map; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceLocation; + +/** + This source reads characters from upstream source, replacing trigraph + sequences with the character that they represent. +*/ +public final class PhaseOneSource implements Source { + + private static final Map REPLACEMENT_MAP + = newReplacementMap(); + + private final Source source; + + /** + Creates a source replacing trigraph sequences. + + @param source The reader from which characters will be read. 
+ */ + public PhaseOneSource(Source source) { + this.source = source; + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + source.close(); + } + + /** {@inheritDoc} */ + @Override + public SourceLocation getLocation() { + return source.getLocation(); + } + + /** {@inheritDoc} */ + @Override + public SourceChar getChar() throws IOException { + var first = read(); + if (first.isEof()) { + return first; + } + if (first.toChar() != '?') { + return first; + } + var second = read(); + if (second.isEof()) { + return first; + } + if (second.toChar() != '?') { + unread(second); + return first; + } + var third = read(); + if (third.isEof()) { + unread(second); + return first; + } + var c = REPLACEMENT_MAP.get(third.toChar()); + if (c == null) { + unread(third); + unread(second); + return first; + } + return SourceChars.of(first, second, third, c); + } + + /** {@inheritDoc} */ + @Override + public void ungetChar(SourceChar c) { + unread(c); + } + + private static Map newReplacementMap() { + return Map.of( + '=', '#', + '/', '\\', + '\'', '^', + '(', '[', + ')', ']', + '!', '|', + '<', '{', + '>', '}', + '-', '~'); + } + + private SourceChar read() throws IOException { + return source.getChar(); + } + + private void unread(SourceChar c) { + source.ungetChar(c); + } +} diff --git a/src/main/java/com/maroontress/clione/impl/PhaseTwoSource.java b/src/main/java/com/maroontress/clione/impl/PhaseTwoSource.java new file mode 100644 index 0000000..640dae8 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/PhaseTwoSource.java @@ -0,0 +1,93 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceLocation; + +/** + This source reads characters from upstream source, splicing lines ended + with the backslash (\). 
+*/ +public final class PhaseTwoSource implements Source { + + private final Source source; + private Function eofIdentity = this::initializeEof; + + /** + Creates a source splicing lines. + + @param source The upstream source. + */ + public PhaseTwoSource(Source source) { + this.source = source; + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + source.close(); + } + + /** {@inheritDoc} */ + @Override + public SourceLocation getLocation() { + return source.getLocation(); + } + + /** {@inheritDoc} */ + @Override + public SourceChar getChar() throws IOException { + var c = read(); + return !c.isEof() ? c : eofIdentity.apply(c); + } + + /** {@inheritDoc} */ + @Override + public void ungetChar(SourceChar c) { + source.ungetChar(c); + } + + private SourceChar read() throws IOException { + var list = SourceChar.EMPTY_LIST; + for (;;) { + var c = source.getChar(); + if (c.isEof()) { + return compose(list, c); + } + if (c.toChar() != '\\') { + return compose(list, c); + } + var next = source.getChar(); + if (next.isEof()) { + return compose(list, c); + } + if (next.toChar() != '\n') { + source.ungetChar(next); + return compose(list, c); + } + if (list.isEmpty()) { + list = new ArrayList<>(); + } + Collections.addAll(list, c, next); + } + } + + private SourceChar compose(List list, SourceChar c) { + if (list.isEmpty()) { + return c; + } + if (c.isEof()) { + return SourceChars.eof(list); + } + return SourceChars.of(list, c); + } + + private SourceChar initializeEof(SourceChar eof) { + eofIdentity = c -> eof; + return eof; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/ReaderSource.java b/src/main/java/com/maroontress/clione/impl/ReaderSource.java new file mode 100644 index 0000000..514f7e8 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/ReaderSource.java @@ -0,0 +1,88 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayDeque; +import 
java.util.Deque; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceLocation; + +/** + This source reads characters from upstream reader, counting the line and + column number. +*/ +public final class ReaderSource implements Source { + + private final UnifiedNewlineReader reader; + private final Deque stack; + private int line = 1; + private int column = 1; + + /** + Creates a new source. + + @param reader The reader from which characters will be read. + */ + public ReaderSource(Reader reader) { + this.reader = new UnifiedNewlineReader(reader); + stack = new ArrayDeque<>(); + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + reader.close(); + } + + /** {@inheritDoc} */ + @Override + public SourceLocation getLocation() { + return new SourceLocation(line, column); + } + + /** {@inheritDoc} */ + @Override + public SourceChar getChar() throws IOException { + { + var c = stack.pollFirst(); + if (c != null) { + return c; + } + } + var i = reader.read(); + if (i == -1) { + return SourceChars.eof(); + } + var c = SourceChars.of((char) i, column, line); + if (i == '\n') { + column = 1; + ++line; + return c; + } + aidSurrogatePair(i); + ++column; + return c; + } + + /** {@inheritDoc} */ + @Override + public void ungetChar(SourceChar c) { + if (c.isEof()) { + throw new IllegalArgumentException("c is EOF"); + } + stack.addFirst(c); + } + + private void aidSurrogatePair(int i) throws IOException { + if (!Character.isHighSurrogate((char) i)) { + return; + } + var next = reader.read(); + if (next == -1) { + return; + } + var nextColumn = (Character.isLowSurrogate((char) next)) + ? 
column : column + 1; + stack.addFirst(SourceChars.of((char) next, nextColumn, line)); + } +} diff --git a/src/main/java/com/maroontress/clione/impl/Source.java b/src/main/java/com/maroontress/clione/impl/Source.java new file mode 100644 index 0000000..799e655 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Source.java @@ -0,0 +1,63 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.io.Reader; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceLocation; + +/** + Provides the stream of the source file. +*/ +public interface Source { + + /** + Closes this source and its upstream {@link Source} or {@link Reader}. + + @throws IOException If an I/O error occurs. + */ + void close() throws IOException; + + /** + Returns the current location of this source. + + @return The current location. + */ + SourceLocation getLocation(); + + /** + Returns a new {@link SourceChar} object at the current location of this + source or EOF. + +

This method does not change the current location if the + return value either represents EOF or is the object that has been + pushed back. Otherwise, the current location advances to the next one.

+ +

This method may read two or more characters from the upstream + {@link Source} or {@link Reader}. It may also replace one or more + {@link SourceChar} objects with another {@link SourceChar} object and + return it.

+ +

The return value representing EOF may differ from + {@link SourceChar#STATIC_EOF}. Do not compare it with + {@link SourceChar#STATIC_EOF}; use the {@link SourceChar#isEof()} method + instead.

+ + @return The new {@link SourceChar} object. + @throws IOException If an I/O error occurs. + @see SourceChar#isEof() + */ + SourceChar getChar() throws IOException; + + /** + Pushes back the specified {@link SourceChar} object. + +

The {@link SourceChar} object to push back must not represent + EOF.

+ +

This method does not change the current location.

+ + @param c The {@link SourceChar} object to push back. + @throws IllegalArgumentException If the object represents EOF. + */ + void ungetChar(SourceChar c); +} diff --git a/src/main/java/com/maroontress/clione/impl/SourceChars.java b/src/main/java/com/maroontress/clione/impl/SourceChars.java new file mode 100644 index 0000000..2c4b0a9 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/SourceChars.java @@ -0,0 +1,213 @@ +package com.maroontress.clione.impl; + +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.SourceLocation; +import com.maroontress.clione.SourceSpan; + +/** + The factory of {@link SourceChar} objects. +*/ +public final class SourceChars { + + /** Prevents the class from being instantiated. */ + private SourceChars() { + throw new AssertionError(); + } + + /** + Returns EOF. + + @return EOF (that is {@link SourceChar#STATIC_EOF}). + */ + public static SourceChar eof() { + return SourceChar.STATIC_EOF; + } + + /** + Returns a new {@link SourceChar} object representing EOF with the + specified child characters. + +

In some cases, EOF can have child characters. For example, the + EOF that follows a backslash followed by a newline character has to + have them as its child characters, as follows:

+
+        int a = 0;\[NL]
+        [EOF]
+

where {@code [NL]} and {@code [EOF]} represent a newline character + and EOF, respectively.

+ +

The object this method returns behaves as follows:

+
    +
  • The {@link SourceChar#isEof()} method returns {@code true}
  • +
  • The {@link SourceChar#toChar()} method throws an + {@link IllegalStateException}
  • +
  • The {@link SourceChar#getSpan()} method returns the span of the + specified child characters
  • +
  • The {@link SourceChar#getChildren()} method returns the specified + child characters
  • +
+ +

Note that the EOF this method returns is an immutable object.

+ + @param children The non-empty collection containing the child + characters. + @return The new EOF. + @throws IllegalArgumentException If the {@code children} is empty. + */ + public static SourceChar eof(Collection children) { + if (children.isEmpty()) { + throw new IllegalArgumentException("children must not be empty"); + } + var list = List.copyOf(children); + var start = list.get(0).getSpan(); + var end = list.get(list.size() - 1).getSpan(); + var span = new SourceSpan(start, end); + return new Eof() { + @Override + public SourceSpan getSpan() { + return span; + } + + @Override + public List getChildren() { + return list; + } + }; + } + + /** + Returns a new {@link SourceChar} object that has the specified child + characters. + +

Typically, this method is to create the character that follows a + backslash followed by a newline character.

+ +

Note that the character this method returns is an immutable + object.

+ + @param children The non-empty collection containing the child + characters other than the last child character. + @param c The character that represents both the last child character + and the new character itself. + @return The new {@link SourceChar} object. + @throws IllegalArgumentException If the {@code children} is empty. + */ + public static SourceChar of(Collection children, + SourceChar c) { + if (children.isEmpty()) { + throw new IllegalArgumentException("children must not be empty"); + } + var list = Stream.concat(children.stream(), Stream.of(c)) + .collect(Collectors.toUnmodifiableList()); + var span = new SourceSpan(list.get(0).getSpan(), c.getSpan()); + return of(c.toChar(), span, list); + } + + /** + Returns a new {@link SourceChar} object that has the specified child + characters. + +

Typically, this method is to create the character which is + substituted for any trigraph sequence.

+ +

Note that the character this method returns is an immutable + object.

+ + @param first The first child character. + @param second The second child character. + @param third The third child character. + @param c The character that represents the new character. + @return The new {@link SourceChar} object. + */ + public static SourceChar of(SourceChar first, SourceChar second, + SourceChar third, char c) { + var start = first.getSpan().getStart(); + var end = third.getSpan().getEnd(); + var span = new SourceSpan(start, end); + return of(c, span, List.of(first, second, third)); + } + + /** + Returns a new {@link SourceChar} object that has the specified child + characters. + +

Typically, this method is to create the character which is + substituted for any digraph sequence.

+ +

Note that the character this method returns is an immutable + object.

+ + @param first The first child character. + @param second The second child character. + @param c The character that represents the new character. + @return The new {@link SourceChar} object. + */ + public static SourceChar of(SourceChar first, SourceChar second, char c) { + var start = first.getSpan().getStart(); + var end = second.getSpan().getEnd(); + var span = new SourceSpan(start, end); + return of(c, span, List.of(first, second)); + } + + /** + Returns a new {@link SourceChar} object that has no child characters + (that is a leaf character). + +

Note that the character this method returns is an immutable + object.

+ + @param c The character that represents the new character. + @param column The column number of the character. + @param line The line number of the character. + @return The new {@link SourceChar} object. + */ + public static SourceChar of(char c, int column, int line) { + var w = new SourceLocation(line, column); + var span = new SourceSpan(w); + return of(c, span, SourceChar.EMPTY_LIST); + } + + /** + Returns a new {@link SourceChar} object that has the specified span + and the specified child characters. + +

Note that the character this method returns is an immutable + object.

+ + @param c The character that represents the new character. + @param span The span that is the range of the characters in the source + file. + @param children The collection containing the child characters, or + {@link SourceChar#EMPTY_LIST}. + @return The new {@link SourceChar} object. + */ + private static SourceChar of(char c, SourceSpan span, + Collection children) { + var list = List.copyOf(children); + return new SourceChar() { + @Override + public boolean isEof() { + return false; + } + + @Override + public char toChar() { + return c; + } + + @Override + public SourceSpan getSpan() { + return span; + } + + @Override + public List getChildren() { + return list; + } + }; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/Switches.java b/src/main/java/com/maroontress/clione/impl/Switches.java new file mode 100644 index 0000000..d1e25d7 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Switches.java @@ -0,0 +1,464 @@ +package com.maroontress.clione.impl; + +import java.util.Set; +import com.maroontress.clione.impl.Case.Mapper; +import com.maroontress.clione.TokenType; + +/** + Provides mappers that associate a character with a tokenizer. +*/ +public final class Switches { + + /** The default mapper. */ + public static final Mapper DEFAULT = Case.newMapper( + // ' ', '\t', or '\n' + Cases.DELIMITER, + // u + LowerU.CASE, + // [LU] + UpperLOrU.CASE, + // " + Cases.STRING_LITERAL, + // ' + Cases.CHARACTER_CONSTANT, + // / + Slash.CASE, + // [_A-Za-z] (except LUu so that the order is important) + Cases.IDENTIFIER, + // '\\' + Backslash.CASE, + // [0-9] + Cases.NUMBER, + // . + Dot.CASE, + // # + Sharp.CASE, + // % + Percent.CASE, + // : + Colon.CASE, + // - + Cases.MINUS, + // ? 
+ Cases.QUESTION, + // [](){},; + Cases.PUNCTUATOR, + // > + Cases.GREATER_THAN, + // < + Cases.LESS_THAN, + // *^!~= + Cases.OPERATOR_FOLLOWED_BY_EQUAL, + // + + Cases.PLUS, + // & + Cases.AND, + // | + Cases.OR); + + /** The mapper used inside preprocessing directives. */ + public static final Mapper DIRECTIVE = Case.newMapper( + // '\n' + Cases.DIRECTIVE_END, + // ' ' or '\t' + Cases.DIRECTIVE_DELIMITER, + // u + LowerU.CASE, + // [LU] + UpperLOrU.CASE, + // " + Cases.STRING_LITERAL, + // ' + Cases.CHARACTER_CONSTANT, + // / + Slash.CASE, + // [_A-Za-z] (except LUu so that the order is important) + Cases.IDENTIFIER, + // '\\' + Backslash.CASE, + // [0-9] + Cases.NUMBER, + // . + Dot.CASE, + // # + SharpInsideDirective.CASE, + // % + PercentInsideDirective.CASE, + // : + Colon.CASE, + // - + Cases.MINUS, + // ? + Cases.QUESTION, + // [](){},; + Cases.PUNCTUATOR, + // > + Cases.GREATER_THAN, + // < + Cases.LESS_THAN, + // *^!~= + Cases.OPERATOR_FOLLOWED_BY_EQUAL, + // + + Cases.PLUS, + // & + Cases.AND, + // | + Cases.OR); + + /** The mapper used inside preprocessing {@code include} directives. */ + public static final Mapper INCLUDE_DIRECTIVE = Case.newMapper( + // '\n' + Cases.DIRECTIVE_END, + // ' ' or '\t' + Cases.DIRECTIVE_DELIMITER, + // u + LowerU.CASE, + // [LU] + UpperLOrU.CASE, + // " + Cases.FILENAME, + // < + Cases.STANDARD_HEADER, + // ' + Cases.CHARACTER_CONSTANT, + // / + Slash.CASE, + // [_A-Za-z] (except LUu so that the order is important) + Cases.IDENTIFIER, + // '\\' + Backslash.CASE, + // [0-9] + Cases.NUMBER, + // . + Dot.CASE, + // # + SharpInsideDirective.CASE, + // % + PercentInsideDirective.CASE, + // : + Colon.CASE, + // - + Cases.MINUS, + // ? + Cases.QUESTION, + // [](){},; + Cases.PUNCTUATOR, + // > + Cases.GREATER_THAN, + // < + Cases.LESS_THAN, + // *^!~= + Cases.OPERATOR_FOLLOWED_BY_EQUAL, + // + + Cases.PLUS, + // & + Cases.AND, + // | + Cases.OR); + + /** Prevents the class from being instantiated. 
*/ + private Switches() { + throw new AssertionError(); + } + + private static Tokenizer newLessOrGreaterThanTokenizer(char first) { + return x -> { + x.readZeroOrOneChar(c -> c == first); + x.readZeroOrOneChar(c -> c == '='); + return TokenType.OPERATOR; + }; + } + + private static Case newFollowingSelfOrEqualCase(char first) { + return Case.of(first, x -> { + x.readZeroOrOneChar(c -> c == first || c == '='); + return TokenType.OPERATOR; + }); + } + + private static Tokenizer newUniversalCharacterNameCase(int m) { + return x -> { + var n = x.readMax(m, Chars::isHexDigit); + if (n < m) { + return TokenType.UNKNOWN; + } + x.readIdentifier(); + return TokenType.IDENTIFIER; + }; + } + + private static class Colon { + public static final Case CASE = Case.of( + ':', TokenType.PUNCTUATOR, + // > + Case.of('>', Digraphs::toRightBracket)); + } + + private static class Sharp { + public static final Case CASE = Case.of( + '#', TokenType.DIRECTIVE, + // # + Case.of('#', x -> TokenType.UNKNOWN)); + } + + private static class Percent { + public static final Case CASE = Case.of( + '%', TokenType.OPERATOR, + // : + ColonAfterPercent.CASE, + // > + Cases.GREATER_THAN_FOLLOWING_PERCENT, + // = + Case.of('=', x -> TokenType.OPERATOR)); + } + + private static class ColonAfterPercent { + public static final Case CASE = Case.of( + ':', Digraphs::toDirective, + // % + PercentAfterPercentColon.CASE); + } + + private static class PercentAfterPercentColon { + public static final Case CASE = Case.of( + '%', TokenType.UNKNOWN, + // : + Case.of(':', Digraphs::toUnknownDoubleNumberSign)); + } + + private static class SharpInsideDirective { + public static final Case CASE = Case.of( + '#', TokenType.OPERATOR, + // # + Case.of('#', x -> TokenType.OPERATOR)); + } + + private static class PercentInsideDirective { + public static final Case CASE = Case.of( + '%', TokenType.OPERATOR, + // : + ColonAfterPercentInsideDirective.CASE, + // > + Cases.GREATER_THAN_FOLLOWING_PERCENT, + // = + Case.of('=', 
x -> TokenType.OPERATOR)); + } + + private static class ColonAfterPercentInsideDirective { + public static final Case CASE = Case.of( + ':', Digraphs::toStringificationOperator, + // % + PercentAfterPercentColonInsideDirective.CASE); + } + + private static class PercentAfterPercentColonInsideDirective { + public static final Case CASE = Case.of( + '%', TokenType.UNKNOWN, + // : + Case.of(':', Digraphs::toTokenPastingOperator)); + } + + private static class Backslash { + // Unicode non-digit character + public static final Case CASE = Case.of( + '\\', TokenType.UNKNOWN, + // u + Cases.LOWER_U_AFTER_BACKSLASH, + // U + Cases.UPPER_U_AFTER_BACKSLASH); + } + + private static class Slash { + // Comment (/*...*/ //...) or operator (/ /=) + public static final Case CASE = Case.of( + '/', TokenType.OPERATOR, + // * + Cases.ASTERISK_AFTER_SLASH, + // / + Cases.DOUBLE_SLASH, + // = + Case.of('=', x -> TokenType.OPERATOR)); + } + + private static class EightAfterLowerU { + // UTF-8 string literal or identifier + public static final Case CASE = Case.of( + '8', TokenType.IDENTIFIER, + // [_A-Za-z0-9] + Cases.IDENTIFIER_AFTER_PREFIX, + // " + Cases.STRING_LITERAL); + } + + private static class LowerU { + // Prefix (u8 u) or identifier + public static final Case CASE = Case.of( + 'u', TokenType.IDENTIFIER, + // 8 + EightAfterLowerU.CASE, + // [_A-Za-z0-9] (except 8) + Cases.IDENTIFIER_AFTER_PREFIX, + // " + Cases.STRING_LITERAL, + // ' + Cases.CHARACTER_CONSTANT); + } + + private static class UpperLOrU { + // Prefix (L U) or identifier + public static final Case CASE = Case.of( + Set.of('L', 'U'), TokenType.IDENTIFIER, + // [_A-Za-z0-9] + Cases.IDENTIFIER_AFTER_PREFIX, + // " + Cases.STRING_LITERAL, + // ' + Cases.CHARACTER_CONSTANT); + } + + private static class Dot { + // Preprocessing number starting with a dot (.[0-9]+) or operator + public static final Case CASE = Case.of( + '.', TokenType.OPERATOR, + // 0-9 + Cases.NUMBER, + // . + Cases.DOUBLE_DOT); + } + + /** Leaf cases. 
*/ + private static class Cases { + public static final Case GREATER_THAN_FOLLOWING_PERCENT = Case.of( + '>', Digraphs::toRightBrace); + + // + ++ += + public static final Case PLUS = newFollowingSelfOrEqualCase('+'); + + // & && &= + public static final Case AND = newFollowingSelfOrEqualCase('&'); + + // | || |= + public static final Case OR = newFollowingSelfOrEqualCase('|'); + + // X X= (X is one of * ^ ! ~ =) + public static final Case OPERATOR_FOLLOWED_BY_EQUAL = Case.of( + Set.of('*', '^', '!', '~', '='), x -> { + x.readZeroOrOneChar(c -> c == '='); + return TokenType.OPERATOR; + }); + + // < << <= <<= <: <% + public static final Case LESS_THAN = Case.of( + '<', newLessOrGreaterThanTokenizer('<'), + // : + Case.of(':', Digraphs::toLeftBracket), + // % + Case.of('%', Digraphs::toLeftBrace)); + + // > >> >= >>= + public static final Case GREATER_THAN = Case.of( + '>', newLessOrGreaterThanTokenizer('>')); + + // [ ] ( ) { } , ; + public static final Case PUNCTUATOR = Case.of( + Set.of('[', ']', '(', ')', '{', '}', ',', ';'), + TokenType.PUNCTUATOR); + + // ? 
+ public static final Case QUESTION = Case.of('?', TokenType.OPERATOR); + + // - -> -- -= + public static final Case MINUS = Case.of( + '-', TokenType.OPERATOR, + // > - = + Case.of(Set.of('>', '-', '='), x -> TokenType.OPERATOR)); + + public static final Case DELIMITER = Case.of( + Chars.DELIMITER_SET, x -> { + x.readZeroOrMoreChars(Chars::isDelimiter); + return TokenType.DELIMITER; + }); + + public static final Case NUMBER = Case.of( + Chars.DIGIT_SET, x -> { + x.readNumber(); + return TokenType.NUMBER; + }); + + public static final Case IDENTIFIER = Case.of( + Chars.FIRST_OF_IDENTIFIER_SET, x -> { + x.readIdentifier(); + return TokenType.IDENTIFIER; + }); + + public static final Case CHARACTER_CONSTANT = Case.of( + '\'', x -> { + x.readStringOrCharacter('\''); + return TokenType.CHARACTER; + }); + + public static final Case STRING_LITERAL = Case.of( + '"', x -> { + x.readStringOrCharacter('"'); + return TokenType.STRING; + }); + + public static final Case DIRECTIVE_DELIMITER = Case.of( + Chars.DIRECTIVE_DELIMITER_SET, x -> { + x.readZeroOrMoreChars(Chars::isDirectiveDelimiter); + return TokenType.DELIMITER; + }); + + public static final Case DIRECTIVE_END = Case.of( + '\n', x -> TokenType.DIRECTIVE_END); + + public static final Case STANDARD_HEADER = Case.of( + '<', x -> { + x.readFilename('>'); + return TokenType.STANDARD_HEADER; + }); + + public static final Case FILENAME = Case.of( + '\"', x -> { + x.readFilename('\"'); + return TokenType.FILENAME; + }); + + public static final Case DOUBLE_DOT = Case.of( + '.', x -> { + if (x.readZeroOrOneChar(c -> c == '.') == null) { + // . + var s = x.getSource(); + var b = x.getBuilder(); + s.ungetChar(b.removeLast()); + return TokenType.OPERATOR; + } + // ... 
+ return TokenType.PUNCTUATOR; + }); + + public static final Case ASTERISK_AFTER_SLASH = Case.of( + '*', x -> { + x.readComment(); + return TokenType.COMMENT; + }); + + public static final Case DOUBLE_SLASH = Case.of( + '/', x -> { + x.readSingleLine(); + return TokenType.COMMENT; + }); + + public static final Case IDENTIFIER_AFTER_PREFIX = Case.of( + Chars.IDENTIFIER_SET, x -> { + x.readIdentifier(); + return TokenType.IDENTIFIER; + }); + + public static final Case LOWER_U_AFTER_BACKSLASH = Case.of( + 'u', newUniversalCharacterNameCase(4)); + + public static final Case UPPER_U_AFTER_BACKSLASH = Case.of( + 'U', newUniversalCharacterNameCase(8)); + } +} diff --git a/src/main/java/com/maroontress/clione/impl/TokenBuilder.java b/src/main/java/com/maroontress/clione/impl/TokenBuilder.java new file mode 100644 index 0000000..c3638e1 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/TokenBuilder.java @@ -0,0 +1,158 @@ +package com.maroontress.clione.impl; + +import java.util.ArrayDeque; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.Token; +import com.maroontress.clione.TokenType; + +/** + The builder of {@link Token} objects that has a mutable sequence of + characters. +*/ +public final class TokenBuilder { + + private final ArrayDeque<SourceChar> queue; + + /** + Creates a new instance. + */ + public TokenBuilder() { + queue = new ArrayDeque<>(); + } + + /** + Returns the number of characters in this builder. + +

Note that this method does not change this builder.

+ + @return The number of characters in this builder. + */ + public int size() { + return queue.size(); + } + + /** + Appends the specified character to this builder. + + @param c The character to append. + */ + public void append(SourceChar c) { + queue.addLast(c); + } + + /** + Removes the last character in this builder and returns the character. + + @return The character that was the last one in this builder. + */ + public SourceChar removeLast() { + return queue.removeLast(); + } + + /** + Returns the last character in this builder. + +

Note that this method does not change this builder.

+ + @return The last character in this builder. + */ + public SourceChar getLast() { + return queue.getLast(); + } + + /** + Replace the two characters in this builder with the new character + representing the specified {@code char} value. + +

Note that this builder must have just two characters to be + replaced. They are assumed to represent any digraph.

+ +

The new substituted character has two child characters. They + correspond with the characters that were in this builder in the + same order.

+ + @param c The character that is substituted for the two characters + in this builder. + @throws IllegalStateException If the number of characters in this + builder is not two. + */ + public void replaceDigraph(char c) { + if (queue.size() != 2) { + throw new IllegalStateException(); + } + var second = queue.removeLast(); + var first = queue.removeLast(); + queue.addLast(SourceChars.of(first, second, c)); + } + + /** + Replace the four characters in this builder with the new two + characters representing the specified {@code char} values. + +

Note that this builder must have just four characters to be + replaced. They are assumed to represent the digraph + '{@code %:%:}'.

+ +

Each character that is substituted has two child characters. + The child characters of the first substituted character correspond + with the first two characters that were in this builder in the same + order. Likewise, the child characters of the second substituted + character correspond with the second two characters that were in this + builder in the same order.

+ + @param c1 The first character that is substituted for the first two + characters in this builder. + @param c2 The second character that is substituted for the second two + characters in this builder. + @throws IllegalStateException If the number of characters in this + builder is not four. + */ + public void replaceDigraph(char c1, char c2) { + if (queue.size() != 4) { + throw new IllegalStateException(); + } + var fourth = queue.removeLast(); + var third = queue.removeLast(); + var second = queue.removeLast(); + var first = queue.removeLast(); + queue.addLast(SourceChars.of(first, second, c1)); + queue.addLast(SourceChars.of(third, fourth, c2)); + } + + /** + Returns a new token that represents the characters in this builder + with the specified token type. + +

Note that this method does not change this builder.

+ + @param type The token type. + @return The new token. + @throws IllegalStateException If this builder is empty. + */ + public Token toToken(TokenType type) { + if (queue.isEmpty()) { + throw new IllegalStateException(); + } + return new DefaultToken(queue, type); + } + + /** + Returns a new string that represents the characters in this builder. + +

Note that this method does not change this builder.

+ + @return The new string. + @throws IllegalStateException If this builder is empty. + */ + public String toTokenString() { + if (queue.isEmpty()) { + throw new IllegalStateException(); + } + var size = queue.size(); + var b = new StringBuilder(size); + for (var c : queue) { + b.append(c.toChar()); + } + return b.toString(); + } +} diff --git a/src/main/java/com/maroontress/clione/impl/Tokenizer.java b/src/main/java/com/maroontress/clione/impl/Tokenizer.java new file mode 100644 index 0000000..0f871f8 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Tokenizer.java @@ -0,0 +1,44 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.TokenType; + +/** + The function that changes the state of the specified {@link Transcriber} + object, lets it read characters from its source and store a new token in + its builder, and returns the token type of the stored token. + + @see DefaultTokenizer +*/ +@FunctionalInterface +public interface Tokenizer { + + /** + Returns the token type of the token that the specified + {@link Transcriber} object reads. + +

The transcriber reads characters from its source to build a new + token. It stores the building token in its {@link TokenBuilder} + object. So use {@link Transcriber#toToken(TokenType)} method to get + the new token object as follows:

+ +
+        Token newToken(Transcriber x, Tokenizer tokenizer) throws IOException {
+            var type = tokenizer.apply(x);
+            if (type == null) {
+                return null;
+            }
+            return x.toToken(type);
+        }
+ +

Note that this function may return {@code null} unlike + {@link DefaultTokenizer#apply(Transcriber, SourceChar)}.

+ + @param x The transcriber. + @return {@code null} if the transcriber's source has reached EOF. + Otherwise, the token type. + @throws IOException If an I/O error occurs. + */ + TokenType apply(Transcriber x) throws IOException; +} diff --git a/src/main/java/com/maroontress/clione/impl/Transcriber.java b/src/main/java/com/maroontress/clione/impl/Transcriber.java new file mode 100644 index 0000000..671c6fe --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/Transcriber.java @@ -0,0 +1,585 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import java.util.function.Predicate; +import com.maroontress.clione.SourceChar; +import com.maroontress.clione.Token; +import com.maroontress.clione.TokenType; +import com.maroontress.clione.impl.Case.Mapper; + +/** + Transcribes a token read from the {@code Source} object into the + {@code TokenBuilder} object. +*/ +public final class Transcriber { + + private static final Map UCN_MAP = Map.of( + 'u', 4, + 'U', 8); + + private static final Set EXP_CHAR_SET = Set.of( + 'E', 'e', 'P', 'p'); + + private static final Set SIGN_CHAR_SET = Set.of('+', '-'); + + private final Source source; + private final TokenBuilder builder; + + /** + Creates a new instance. + + @param source The source that provides the stream of the source file. + */ + public Transcriber(Source source) { + this.source = source; + this.builder = new TokenBuilder(); + } + + /** + Returns the source. + + @return The source. + */ + public Source getSource() { + return source; + } + + /** + Returns the token builder. + + @return The token builder. + */ + public TokenBuilder getBuilder() { + return builder; + } + + /** + Returns a new token with the specified token type. + +

Invocation of this method is equivalent to:

+
getBuilder().toToken(type)
+ + @param type The token type. + @return The new token. + */ + public Token toToken(TokenType type) { + return builder.toToken(type); + } + + /** + Reads a comment from the source. + +

The token builder must have stored the slash and asterisk.

+ +

This method will return when it reads the end of the comment + (asterisk and slash) or reaches EOF.

+ + @throws IOException If an I/O error occurs. + */ + public void readComment() throws IOException { + var s = source; + var b = builder; + for (;;) { + var i = s.getChar(); + if (i.isEof()) { + return; + } + b.append(i); + var c = i.toChar(); + if (c != '*') { + continue; + } + var next = s.getChar(); + if (next.isEof()) { + return; + } + var n = next.toChar(); + if (n == '/') { + b.append(next); + return; + } + s.ungetChar(next); + } + } + + /** + Reads characters from the source until just before a newline + character. + +

This method will return when it reads characters up to just before + the newline character or reaches EOF.

+ + @throws IOException If an I/O error occurs. + */ + public void readSingleLine() throws IOException { + var s = source; + var b = builder; + for (;;) { + var i = s.getChar(); + if (i.isEof()) { + return; + } + var c = i.toChar(); + if (c == '\n') { + s.ungetChar(i); + return; + } + b.append(i); + } + } + + /** + Reads characters from the source up to the specified terminator + character (including the terminator character). + +

This method will return when it reads the specified terminator + character, reaches a newline character (without reading it), or + reaches EOF.

+ + @param terminator The character that terminates the filename. + @throws IOException If an I/O error occurs. + */ + public void readFilename(char terminator) throws IOException { + var s = source; + var b = builder; + for (;;) { + var i = s.getChar(); + if (i.isEof()) { + return; + } + var c = i.toChar(); + if (c == '\n') { + s.ungetChar(i); + return; + } + b.append(i); + if (c == terminator) { + return; + } + } + } + + /** + Reads characters from the source up to the specified terminator + character (including the terminator character). + +

The token builder must have stored either single- or double-quote + character.

+ +

This method takes escape sequences into account.

+ +

This method will return when it reads the specified terminator + character (except within an escape sequence) or reaches EOF.

+ +

When this method reaches a newline character, it will return + without reading it.

+ + @param terminator The terminator character. + @throws IOException If an I/O error occurs. + */ + public void readStringOrCharacter(char terminator) throws IOException { + var s = source; + var b = builder; + for (;;) { + var i = s.getChar(); + if (i.isEof()) { + return; + } + var c = i.toChar(); + if (c == '\n') { + s.ungetChar(i); + return; + } + b.append(i); + if (c == terminator) { + return; + } + if (c == '\\') { + readEscapeSequence(); + } + } + } + + private void readEscapeSequence() throws IOException { + var i = source.getChar(); + if (i.isEof()) { + return; + } + builder.append(i); + var c = i.toChar(); + if (Chars.isOctalDigit(c)) { + readMax(2, Chars::isOctalDigit); + return; + } + if (c == 'x') { + readZeroOrMoreChars(Chars::isHexDigit); + return; + } + if (c == 'u') { + readMax(4, Chars::isHexDigit); + return; + } + if (c == 'U') { + readMax(8, Chars::isHexDigit); + return; + } + assert c != '\n'; + } + + /** + Reads at most the specified number of characters while the specified + predicate with the character returns {@code true}. + +

This method will return when it reaches EOF.

+ + @param max The maximum number of characters. + @param accepts The predicate that returns {@code true} if the specified + character is accepted. + @return The number of characters actually read. + @throws IOException If an I/O error occurs. + */ + public int readMax(int max, Predicate accepts) + throws IOException { + var k = 0; + for (; k < max && readZeroOrOneChar(accepts) != null; ++k) { + continue; + } + return k; + } + + /** + Reads an identifier. + +

The token builder must have stored the first character of an + identifier. It may also have stored the second and subsequent + characters of an identifier.

+ +

The second and subsequent character of an identifier must be + either:

+
    +
  • An underscore character, an uppercase or lowercase letter, or a + digit ({@code [_A-Za-z0-9]})
  • +
  • Universal character names ({@code \}{@code uXXXX} or + {@code \}{@code UXXXXXXXX}, {@code X} is a hexadecimal digit)
  • +
  • Other implementation-defined characters
  • +
+ +

The other implementation-defined characters are as + follows:

+
    +
  • The first character: a character with which the + {@link Character#isUnicodeIdentifierStart(int)} method returns + {@code true}
  • +
  • The second and subsequent character: a character with which the + {@link Character#isUnicodeIdentifierPart(int)} + method returns {@code true}
  • +
+ +

This method will return when it reaches EOF.

+ +

When this method reaches a character that is not an identifier, + it will return without reading it.

+ + @throws IOException If an I/O error occurs. + */ + public void readIdentifier() throws IOException { + var s = source; + var b = builder; + for (;;) { + var first = s.getChar(); + if (first.isEof()) { + return; + } + var c = first.toChar(); + if (Chars.isName(c) + || Character.isUnicodeIdentifierPart(c)) { + b.append(first); + continue; + } + if (c == '\\') { + if (!tryReadUcn(first)) { + return; + } + continue; + } + if (Character.isHighSurrogate(c)) { + if (!tryReadSurrogatePair(first)) { + return; + } + continue; + } + s.ungetChar(first); + return; + } + } + + /** + Reads zero or more characters while the specified predicate with the + character returns {@code true}. + +

This method will return when it reaches EOF.

+ + @param accepts The predicate that returns {@code true} if the specified + character is accepted. + @throws IOException If an I/O error occurs. + */ + public void readZeroOrMoreChars(Predicate accepts) + throws IOException { + var s = source; + var b = builder; + for (;;) { + var i = s.getChar(); + if (i.isEof()) { + return; + } + var c = i.toChar(); + if (!accepts.test(c)) { + s.ungetChar(i); + return; + } + b.append(i); + } + } + + /** + Reads at most one character with which the specified predicate + returns {@code true}. + +

This method will return when it reaches EOF.

+ + @param accepts The predicate that returns {@code true} if the specified + character is accepted. + @return The character to have read, or {@code null} if no character + has been read. + @throws IOException If an I/O error occurs. + */ + public SourceChar readZeroOrOneChar(Predicate accepts) + throws IOException { + var s = source; + var b = builder; + var i = s.getChar(); + if (i.isEof()) { + return null; + } + var c = i.toChar(); + if (accepts.test(c)) { + b.append(i); + return i; + } + s.ungetChar(i); + return null; + } + + /** + Reads a preprocessing number. + +

The token builder must have stored the first character of a + preprocessing number. It may also have stored the second and subsequent + characters of a preprocessing number.

+ +

This method will return when it reaches EOF.

+ +

When this method reaches a character that is not a preprocessing + number, it will return without reading it.

+ + @throws IOException If an I/O error occurs. + */ + public void readNumber() throws IOException { + var s = source; + var b = builder; + var last = b.getLast(); + for (;;) { + var i = s.getChar(); + if (i.isEof()) { + return; + } + var c = i.toChar(); + if (Chars.isPreprocessingNumber(c)) { + last = i; + b.append(i); + continue; + } + var prev = last.toChar(); + if (EXP_CHAR_SET.contains(prev) + && SIGN_CHAR_SET.contains(c)) { + last = i; + b.append(i); + continue; + } + s.ungetChar(i); + return; + } + } + + /** + Reads characters according to the specified mapper. + +

This method reads the character that the mapper maps to the + {@link Tokenizer} object and then invokes the tokenizer with + {@code this} transcriber.

+ +

When this method reaches a character and the mapper does not map + the character or reaches EOF, it returns the specified token type + {@code otherwise} without reading any character.

+ + @param mapper The mapper. + @param otherwise The token type that this method returns either + if it reaches a character that {@code mapper} does not map, + or if it reaches EOF. + @return The token type that the tokenizer returns if the {@code mapper} + maps the character to the tokenizer. Otherwise, {@code otherwise}. + @throws IOException If an I/O error occurs. + */ + public TokenType tryReadToken(Mapper mapper, TokenType otherwise) + throws IOException { + var s = source; + var b = builder; + var i = s.getChar(); + if (i.isEof()) { + return otherwise; + } + var c = i.toChar(); + var a = mapper.get(c); + if (a == null) { + s.ungetChar(i); + return otherwise; + } + b.append(i); + return a.apply(this); + } + + /** + Reads characters according to the specified mapper. + +

This method reads the character that the mapper maps to the + {@link Tokenizer} object and then invokes the tokenizer with + {@code this} transcriber.

+ +

When this method reaches a character and the mapper does not map + the character or reaches EOF, it invokes the tokenizer + {@code otherwise} without reading any character.

+ + @param mapper The mapper. + @param otherwise The tokenizer that this method invokes either + if it reaches a character that {@code mapper} does not map, + or if it reaches EOF. + @return The token type that the tokenizer returns if the {@code mapper} + maps the character to the tokenizer. Otherwise, the token type + that the tokenizer {@code otherwise} returns. + @throws IOException If an I/O error occurs. + */ + public TokenType tryReadToken(Mapper mapper, Tokenizer otherwise) + throws IOException { + var s = source; + var b = builder; + var i = s.getChar(); + if (i.isEof()) { + return otherwise.apply(this); + } + var c = i.toChar(); + var a = mapper.get(c); + if (a == null) { + s.ungetChar(i); + return otherwise.apply(this); + } + b.append(i); + return a.apply(this); + } + + /** + Reads characters according to the specified mapper. + +

This method reads the character that the mapper maps to the + {@link Tokenizer} object and then invokes the tokenizer with + {@code this} transcriber.

+ +

When this method reads a character and the mapper does not map + the character, it invokes the default tokenizer {@code otherwise} + with it.

+ +

When this method reaches EOF, it returns {@code null} without + reading any character.

+ + @param mapper The mapper. + @param otherwise The default tokenizer that this method invokes + if it has read a character that {@code mapper} does not map. + @return The token type that the tokenizer returns if the {@code mapper} + maps the character to the tokenizer. The token type + that the default tokenizer {@code otherwise} returns if the + {@code mapper} does not map the character. + {@code null} if this method reaches EOF. + @throws IOException If an I/O error occurs. + */ + public TokenType readTokenOtherwise( + Mapper mapper, DefaultTokenizer otherwise) throws IOException { + var s = source; + var b = builder; + var i = s.getChar(); + if (i.isEof()) { + return null; + } + var c = i.toChar(); + var a = mapper.get(c); + if (a == null) { + return otherwise.apply(this, i); + } + b.append(i); + return a.apply(this); + } + + private boolean tryReadUcn(SourceChar first) throws IOException { + var s = source; + var second = s.getChar(); + if (second.isEof()) { + s.ungetChar(first); + return false; + } + var u = second.toChar(); + var count = UCN_MAP.get(u); + if (count == null) { + s.ungetChar(second); + s.ungetChar(first); + return false; + } + var b = builder; + b.append(first); + b.append(second); + var n = readMax(count, Chars::isHexDigit); + if (n < count) { + rollback(n + 2); + return false; + } + return true; + } + + private boolean tryReadSurrogatePair(SourceChar first) throws IOException { + var s = source; + var c = first.toChar(); + var second = s.getChar(); + if (second.isEof()) { + s.ungetChar(first); + return false; + } + var n = second.toChar(); + if (!Character.isLowSurrogate(n) + || !Character.isUnicodeIdentifierPart( + Character.toCodePoint(c, n))) { + s.ungetChar(first); + s.ungetChar(second); + return false; + } + var b = builder; + b.append(first); + b.append(second); + return true; + } + + private void rollback(int m) { + var s = source; + var b = builder; + for (var k = 0; k < m; ++k) { + s.ungetChar(b.removeLast()); + } + } +} diff --git 
a/src/main/java/com/maroontress/clione/impl/UnifiedNewlineReader.java b/src/main/java/com/maroontress/clione/impl/UnifiedNewlineReader.java new file mode 100644 index 0000000..e5eb9ec --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/UnifiedNewlineReader.java @@ -0,0 +1,48 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.io.PushbackReader; +import java.io.Reader; +import com.maroontress.clione.LexicalParser; + +/** + This reader substitutes {@code '\n'} for all newlines (LF, CRLF, and CR) + in the stream, even if different newlines are mixed in the stream. + +

Note that the {@link LexicalParser} uses {@code '\n'} as the newline + character.

+*/ +public final class UnifiedNewlineReader extends AbstractReader { + /** + The source of characters, wrapped in a {@link PushbackReader} so that + {@link #read()} can push back the character that follows a CR. + */ + private final PushbackReader reader; + + /** + Creates a reader unifying newlines. + + @param reader The reader from which characters will be read. + */ + public UnifiedNewlineReader(Reader reader) { + this.reader = new PushbackReader(reader); + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + reader.close(); + } + + /** {@inheritDoc} */ + @Override + public int read() throws IOException { + var c = reader.read(); + if (c != '\r') { + return c; + } + // replace \r\n and \r with \n: the character after the CR is consumed + // when it is LF (CRLF) or EOF, and pushed back otherwise so that the + // next call to read() returns it + var next = reader.read(); + if (next != -1 && next != '\n') { + reader.unread(next); + } + return '\n'; + } +} diff --git a/src/main/java/com/maroontress/clione/impl/package-info.java b/src/main/java/com/maroontress/clione/impl/package-info.java new file mode 100644 index 0000000..77f9d95 --- /dev/null +++ b/src/main/java/com/maroontress/clione/impl/package-info.java @@ -0,0 +1,5 @@ +/** + This package provides the implementation of a lexical parser that must + not be exported outside the module. +*/ +package com.maroontress.clione.impl; diff --git a/src/main/java/com/maroontress/clione/package-info.java b/src/main/java/com/maroontress/clione/package-info.java new file mode 100644 index 0000000..d5012c8 --- /dev/null +++ b/src/main/java/com/maroontress/clione/package-info.java @@ -0,0 +1,99 @@ +/** + This package provides an API of a lexical parser that tokenizes source code + written in C17 and other C-like programming languages. + +

The main facility is a tokenization API corresponding to the C + preprocessor layer. It includes the features of trigraph replacement, line + splicing, and tokenization but does not include macro expansion and + directive handling.

+ +

A typical usage example would be as follows:

+
+    package com.example;
+
+    import java.io.IOException;
+    import java.nio.file.FileSystems;
+    import java.nio.file.Files;
+
+    import com.maroontress.clione.LexicalParser;
+    import com.maroontress.clione.Token;
+
+    public final class TokenDemo {
+
+        public static void main(String[] args) {
+            var path = FileSystems.getDefault().getPath(args[0]);
+            try (var parser = LexicalParser.of(Files.newBufferedReader(path))) {
+                run(parser);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+
+        public static void run(LexicalParser parser) throws IOException {
+            for (;;) {
+                var maybeToken = parser.next();
+                if (maybeToken.isEmpty()) {
+                    break;
+                }
+                var token = maybeToken.get();
+                printToken(token, "");
+            }
+        }
+
+        public static void printToken(Token token, String indent) {
+            var type = token.getType();
+            var value = token.getValue();
+            var span = token.getSpan();
+            var s = switch (type) {
+                case DELIMITER, DIRECTIVE_END
+                        -> "'" + value.replaceAll("\n", "\\\\n") + "'";
+                default -> value;
+            };
+            System.out.printf("%s%s: %s: %s%n", indent, span, type, s);
+            for (var child : token.getChildren()) {
+                printToken(child, indent + "| ");
+            }
+        }
+    }
+ +

And {@code helloworld.c} would be as follows:

+
+    #include <stdio.h>
+
+    int main(void)
+    {
+        printf("hello world\n");
+    }
+ +

In this example, the result of + "{@code java com.example.TokenDemo helloworld.c}" is as follows:

+ +
+    L1:1--19: DIRECTIVE: #
+    | L1:2--8: DIRECTIVE_NAME: include
+    | L1:9: DELIMITER: ' '
+    | L1:10--18: STANDARD_HEADER: <stdio.h>
+    | L1:19: DIRECTIVE_END: '\n'
+    L2:1: DELIMITER: '\n'
+    L3:1--3: RESERVED: int
+    L3:4: DELIMITER: ' '
+    L3:5--8: IDENTIFIER: main
+    L3:9: PUNCTUATOR: (
+    L3:10--13: RESERVED: void
+    L3:14: PUNCTUATOR: )
+    L3:15: DELIMITER: '\n'
+    L4:1: PUNCTUATOR: {
+    L4:2--L5:4: DELIMITER: '\n    '
+    L5:5--10: IDENTIFIER: printf
+    L5:11: PUNCTUATOR: (
+    L5:12--26: STRING: "hello world\n"
+    L5:27: PUNCTUATOR: )
+    L5:28: PUNCTUATOR: ;
+    L5:29: DELIMITER: '\n'
+    L6:1: PUNCTUATOR: }
+    L6:2: DELIMITER: '\n'
+ + @see com.maroontress.clione.LexicalParser + @see com.maroontress.clione.Token +*/ +package com.maroontress.clione; diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java new file mode 100644 index 0000000..8d79a52 --- /dev/null +++ b/src/main/java/module-info.java @@ -0,0 +1,7 @@ +/** + This module provides the implementation of a lexical parser that tokenizes + source code written in C17 and other C-like programming languages. +*/ +module com.maroontress.clione { + exports com.maroontress.clione; +} diff --git a/src/test/java/com/example/Demo.java b/src/test/java/com/example/Demo.java new file mode 100644 index 0000000..d1ca36b --- /dev/null +++ b/src/test/java/com/example/Demo.java @@ -0,0 +1,33 @@ +package com.example; + +import com.maroontress.clione.LexicalParser; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; + /** + Runs {@code TokenDemo} and {@code SourceCharDemo} as JUnit tests against + C source files that are loaded as class resources. +*/ +public final class Demo { + + @Test + public void helloWorld() throws IOException { + TokenDemo.run(of("helloworld.c")); + } + + @Test + public void trigraphAndLineConcat() throws IOException { + SourceCharDemo.run(of("main.c")); + } + + @Test + public void surrogatePair() throws IOException { + SourceCharDemo.run(of("emojicat.c")); + } + /** + Creates a parser that reads the specified resource as UTF-8. The + resource is expected to exist; this is checked with an + {@code assert} only. + + @param file The resource name, resolved with + {@code getResourceAsStream(String)} on this class. + @return The new parser. + */ + private LexicalParser of(String file) { + var in = getClass().getResourceAsStream(file); + assert in != null; + var charSet = StandardCharsets.UTF_8; + return LexicalParser.of(new InputStreamReader(in, charSet)); + } +} diff --git a/src/test/java/com/example/SourceCharDemo.java b/src/test/java/com/example/SourceCharDemo.java new file mode 100644 index 0000000..492cce5 --- /dev/null +++ b/src/test/java/com/example/SourceCharDemo.java @@ -0,0 +1,61 @@ +package com.example; + +import java.io.IOException; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.util.List; + +import com.maroontress.clione.LexicalParser; +import com.maroontress.clione.SourceChar; +import
com.maroontress.clione.Token; + /** + A demo that prints every token of a source file, each followed by the + source characters (and their children) that the token is made of. +*/ +public final class SourceCharDemo { + /** + Parses the file and prints the result to the standard output. An + {@code IOException} is reported with {@code printStackTrace()}. + + @param args The command line; {@code args[0]} is the path of the + file to parse. + */ + public static void main(String[] args) { + var path = FileSystems.getDefault().getPath(args[0]); + try (var parser = LexicalParser.of(Files.newBufferedReader(path))) { + run(parser); + } catch (IOException e) { + e.printStackTrace(); + } + } + /** + Prints the tokens that the specified parser supplies, until + {@code next()} returns an empty value. + + @param parser The parser. + @throws IOException If an I/O error occurs. + */ + public static void run(LexicalParser parser) throws IOException { + for (;;) { + var maybeToken = parser.next(); + if (maybeToken.isEmpty()) { + break; + } + printToken(maybeToken.get()); + } + } + /** + Prints the span, type, and value of the specified token, followed + by its characters. The value of a {@code DELIMITER} or + {@code DIRECTIVE_END} token is quoted, with each newline replaced + by a backslash and {@code n}. + + @param token The token. + */ + public static void printToken(Token token) { + var type = token.getType(); + var value = token.getValue(); + var span = token.getSpan(); + var s = switch (type) { + case DELIMITER, DIRECTIVE_END + -> "'" + value.replaceAll("\n", "\\\\n") + "'"; + default -> value; + }; + System.out.printf("%s: %s: %s%n", span, type, s); + printChars(token.getChars(), " "); + } + /** + Prints each of the specified characters with its span, then its + children recursively with a deeper indent. A newline is printed in + an escaped form, and a surrogate code unit is printed as its + hexadecimal value marked with {@code H} (high) or {@code L} (low). + + @param chars The characters. + @param indent The prefix of each printed line. + */ + private static void printChars(List chars, String indent) { + for (var c : chars) { + var span = c.getSpan(); + var value = c.toChar(); + var s = (value == '\n') + ? "'\\n'" + : Character.isHighSurrogate(value) + ? "H(0x" + Integer.toString((int) value, 16) + ")" + : Character.isLowSurrogate(value) + ?
"L(0x" + Integer.toString((int) value, 16) + ")" + : String.valueOf(value); + System.out.printf("%s%s: %s%n", indent, span, s); + printChars(c.getChildren(), indent + "| "); + } + } +} diff --git a/src/test/java/com/example/TokenDemo.java b/src/test/java/com/example/TokenDemo.java new file mode 100644 index 0000000..2ac591b --- /dev/null +++ b/src/test/java/com/example/TokenDemo.java @@ -0,0 +1,46 @@ +package com.example; + +import java.io.IOException; +import java.nio.file.FileSystems; +import java.nio.file.Files; + +import com.maroontress.clione.LexicalParser; +import com.maroontress.clione.Token; + /** + A demo that prints every token of a source file, with the children of + each token indented below it. +*/ +public final class TokenDemo { + /** + Parses the file and prints the result to the standard output. An + {@code IOException} is reported with {@code printStackTrace()}. + + @param args The command line; {@code args[0]} is the path of the + file to parse. + */ + public static void main(String[] args) { + var path = FileSystems.getDefault().getPath(args[0]); + try (var parser = LexicalParser.of(Files.newBufferedReader(path))) { + run(parser); + } catch (IOException e) { + e.printStackTrace(); + } + } + /** + Prints the tokens that the specified parser supplies, until + {@code next()} returns an empty value. + + @param parser The parser. + @throws IOException If an I/O error occurs. + */ + public static void run(LexicalParser parser) throws IOException { + for (;;) { + var maybeToken = parser.next(); + if (maybeToken.isEmpty()) { + break; + } + var token = maybeToken.get(); + printToken(token, ""); + } + } + /** + Prints the span, type, and value of the specified token, then its + children recursively with a deeper indent. The value of a + {@code DELIMITER} or {@code DIRECTIVE_END} token is quoted, with + each newline replaced by a backslash and {@code n}. + + @param token The token. + @param indent The prefix of the printed line. + */ + public static void printToken(Token token, String indent) { + var type = token.getType(); + var value = token.getValue(); + var span = token.getSpan(); + var s = switch (type) { + case DELIMITER, DIRECTIVE_END + -> "'" + value.replaceAll("\n", "\\\\n") + "'"; + default -> value; + }; + System.out.printf("%s%s: %s: %s%n", indent, span, type, s); + for (var child : token.getChildren()) { + printToken(child, indent + "| "); + } + } +} diff --git a/src/test/java/com/maroontress/clione/LexicalParserTest.java b/src/test/java/com/maroontress/clione/LexicalParserTest.java new file mode 100644 index 0000000..93854ff --- /dev/null +++ b/src/test/java/com/maroontress/clione/LexicalParserTest.java @@ -0,0 +1,826 @@ +package com.maroontress.clione; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Arrays; +import
java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Test; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; + +public final class LexicalParserTest { + + @Test + public void delimiters() { + // The escape sequence '\v' in C is not in Java. You can represent a VT + // character in Java with a backslash followed by 'u000b'. + var s = "foo bar\tbaz\fbarbaz\u000bqux"; + var list = List.of(pair("foo", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("bar", TokenType.IDENTIFIER), + pair("\t", TokenType.DELIMITER), + pair("baz", TokenType.IDENTIFIER), + pair("\f", TokenType.DELIMITER), + pair("barbaz", TokenType.IDENTIFIER), + pair("\u000b", TokenType.DELIMITER), + pair("qux", TokenType.IDENTIFIER)); + test(s, list); + } + + @Test + public void identifiers() { + var s = "foo\nbar\nbaz"; + var list = List.of(pair("foo", TokenType.IDENTIFIER), + pair("\n", TokenType.DELIMITER), + pair("bar", TokenType.IDENTIFIER), + pair("\n", TokenType.DELIMITER), + pair("baz", TokenType.IDENTIFIER)); + test(s, list); + } + + @Test + public void punctuators() { + test("[ ] ( ) { } , ; : ...", TokenType.PUNCTUATOR); + } + + @Test + public void withoutKeywords() { + var s = "auto"; + test(s, parser -> { + { + var maybeToken = parser.next(); + assertThat(maybeToken.isPresent(), is(true)); + var token = maybeToken.get(); + assertThat(token.getType(), is(TokenType.IDENTIFIER)); + assertThat(token.getValue(), is(s)); + } + { + var maybeToken = parser.next(); + assertThat(maybeToken.isEmpty(), is(true)); + } + }, Collections.emptySet()); + } + + @Test + public void keywords() { + test("auto break case char const continue default do double " + + "else enum extern float for goto if int long register return " + + "short signed 
sizeof static struct switch typedef union " + + "unsigned void volatile while " + + "_Bool _Complex _Imaginary inline restrict " + + "_Alignas _Alignof _Atomic _Generic _Noreturn _Static_assert " + + "_Thread_local", TokenType.RESERVED); + } + + @Test + public void digraphReplacement0() { + var s = "%:\n"; + var childList = List.of(pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void digraphReplacement1() { + var s = "<: :> <% %>"; + var list = List.of(pair("[", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("]", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("{", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("}", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void digraphReplacement2() { + var s = "%:define x(a,b) a%:%:b\n"; + var childList = List.of( + pair("define", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("x", TokenType.IDENTIFIER), + pair("(", TokenType.PUNCTUATOR), + pair("a", TokenType.IDENTIFIER), + pair(",", TokenType.PUNCTUATOR), + pair("b", TokenType.IDENTIFIER), + pair(")", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("a", TokenType.IDENTIFIER), + pair("##", TokenType.OPERATOR), + pair("b", TokenType.IDENTIFIER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void digraphReplacement3() { + var s = "%:#\n"; + var childList = List.of(pair("#", TokenType.OPERATOR), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void digraphReplacement4() { + var s = "#%:\n"; + var childList = List.of(pair("#", TokenType.OPERATOR), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public 
void trigraphReplacement0() { + var s = "??=\n"; + var childList = List.of(pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void trigraphReplacement1() { + var s = "??' ??! ??-"; + var list = List.of(pair("^", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("|", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("~", TokenType.OPERATOR)); + test(s, list); + } + + @Test + public void trigraphReplacement2() { + var s = "??( ??) ??< ??>"; + var list = List.of(pair("[", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("]", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("{", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("}", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void trigraphReplacement3() { + var s = "ma??/\nin '??/'' \"??/\""; + var list = List.of(pair("main", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("'\\''", TokenType.CHARACTER), + pair(" ", TokenType.DELIMITER), + pair("\"\\\"", TokenType.STRING)); + test(s, list); + } + + @Test + public void unknown() { + test("$ @", TokenType.UNKNOWN); + } + + @Test + public void comment() { + var s = "/* foo */"; + var list = List.of(pair(s, TokenType.COMMENT)); + test(s, list); + } + + @Test + public void singleLineComment1() { + var s = "// bar\n"; + var list = List.of( + pair("// bar", TokenType.COMMENT), + pair("\n", TokenType.DELIMITER)); + test(s, list); + } + + @Test + public void singleLineComment2() { + var s = "// baz"; + var list = List.of(pair(s, TokenType.COMMENT)); + test(s, list); + } + + @Test + public void directive0() { + var s = "#define DEBUG\n"; + var childList = List.of( + pair("define", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("DEBUG", TokenType.IDENTIFIER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + 
test(s, list); + } + + @Test + public void directive1() { + var s = "#define square(x)\\\n" + + " ((x)*(x))\n"; + var childList = List.of( + pair("define", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("square", TokenType.IDENTIFIER), + pair("(", TokenType.PUNCTUATOR), + pair("x", TokenType.IDENTIFIER), + pair(")", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("(", TokenType.PUNCTUATOR), + pair("(", TokenType.PUNCTUATOR), + pair("x", TokenType.IDENTIFIER), + pair(")", TokenType.PUNCTUATOR), + pair("*", TokenType.OPERATOR), + pair("(", TokenType.PUNCTUATOR), + pair("x", TokenType.IDENTIFIER), + pair(")", TokenType.PUNCTUATOR), + pair(")", TokenType.PUNCTUATOR), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void directive2() { + var s = "#if 1\n"; + var childList = List.of( + pair("if", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("1", TokenType.NUMBER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void directive3() { + var s = "#define x(a,b) a##b\n"; + var childList = List.of( + pair("define", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("x", TokenType.IDENTIFIER), + pair("(", TokenType.PUNCTUATOR), + pair("a", TokenType.IDENTIFIER), + pair(",", TokenType.PUNCTUATOR), + pair("b", TokenType.IDENTIFIER), + pair(")", TokenType.PUNCTUATOR), + pair(" ", TokenType.DELIMITER), + pair("a", TokenType.IDENTIFIER), + pair("##", TokenType.OPERATOR), + pair("b", TokenType.IDENTIFIER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void includeDirective0() { + var s = "#include/* COMMENT\n" + + "*/// COMMENT\n"; + var childList = List.of( + pair("include", TokenType.DIRECTIVE_NAME), + pair("/* 
COMMENT\n*/", TokenType.COMMENT), + pair("", TokenType.STANDARD_HEADER), + pair("// COMMENT", TokenType.COMMENT), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void includeDirective1() { + var s = "#/* COMMENT */include \"main.h\"/**/ \n"; + var childList = List.of( + pair("/* COMMENT */", TokenType.COMMENT), + pair("include", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("\"main.h\"", TokenType.FILENAME), + pair("/**/", TokenType.COMMENT), + pair(" ", TokenType.DELIMITER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void includeDirective2() { + var s = "#include <:a:>\n"; + var childList = List.of( + pair("include", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("<:a:>", TokenType.STANDARD_HEADER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void includeDirective3() { + var s = "#include <%a%>\n"; + var childList = List.of( + pair("include", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("<%a%>", TokenType.STANDARD_HEADER), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of(pair("#", TokenType.DIRECTIVE, childList)); + test(s, list); + } + + @Test + public void includeDirective4() { + var s = "#define X(x) x.h>\n#include X(", TokenType.OPERATOR), + pair("\n", TokenType.DIRECTIVE_END)); + var includeChildList = List.of( + pair("include", TokenType.DIRECTIVE_NAME), + pair(" ", TokenType.DELIMITER), + pair("X", TokenType.IDENTIFIER), + pair("(", TokenType.PUNCTUATOR), + pair("<", TokenType.OPERATOR), + pair("stdio", TokenType.IDENTIFIER), + pair(")", TokenType.PUNCTUATOR), + pair("\n", TokenType.DIRECTIVE_END)); + var list = List.of( + pair("#", TokenType.DIRECTIVE, defineChildList), + pair("#", 
TokenType.DIRECTIVE, includeChildList)); + test(s, list); + } + + @Test + public void unterminatedStandardHeader() { + var s = "#include < >= <= " + + "! && || " + + "~ & | ^ << >> " + + "= += -= *= /= %= &= |= ^= <<= >>= " + + ". -> " + + "?", TokenType.OPERATOR); + } + + @Test + public void identifierNotPrefix() { + test("u U L u8", TokenType.IDENTIFIER); + } + + @Test + public void lineConcatenates() { + var s = "ma\\\nin"; + test(s, parser -> { + var maybeToken = parser.next(); + var where = parser.getLocation(); + assertThat(maybeToken.isPresent(), is(true)); + var token = maybeToken.get(); + assertThat(token.getType(), is(TokenType.IDENTIFIER)); + assertThat(token.getValue(), is("main")); + assertThat(token.getSpan().toString(), is("L1:1--L2:2")); + assertThat(where.toString(), is("L2:3")); + + var iList = List.of( + pair('\\', "L1:3"), + pair('\n', "L1:4"), + pair('i', "L2:1")); + var list = List.of( + pair('m', "L1:1"), + pair('a', "L1:2"), + pair('i', "L1:3--L2:1", iList), + pair('n', "L2:2")); + var chars = token.getChars(); + test(chars, list); + }); + } + + @Test + public void lineConcatenatesWithTrigraph() { + var s = "ma??/\nin"; + // 123456 12 + test(s, parser -> { + var maybeToken = parser.next(); + var where = parser.getLocation(); + assertThat(maybeToken.isPresent(), is(true)); + var token = maybeToken.get(); + assertThat(token.getType(), is(TokenType.IDENTIFIER)); + assertThat(token.getValue(), is("main")); + assertThat(token.getSpan().toString(), is("L1:1--L2:2")); + assertThat(where.toString(), is("L2:3")); + + var trigraphList = List.of( + pair('?', "L1:3"), + pair('?', "L1:4"), + pair('/', "L1:5")); + var iList = List.of( + pair('\\', "L1:3--5", trigraphList), + pair('\n', "L1:6"), + pair('i', "L2:1")); + var list = List.of( + pair('m', "L1:1"), + pair('a', "L1:2"), + pair('i', "L1:3--L2:1", iList), + pair('n', "L2:2")); + var chars = token.getChars(); + test(chars, list); + }); + } + + @Test + public void characterConstant() { + var s = "'c' 
L'w' u'u' U'U'"; + test(s, TokenType.CHARACTER); + } + + @Test + public void unterminatedCharacterConstant() { + var s = "'c\n"; + var list = List.of(pair("'c", TokenType.CHARACTER), + pair("\n", TokenType.DELIMITER)); + test(s, list); + } + + @Test + public void stringLiteral() { + var s = "\"char\" L\"wchar_t\" u\"ucs2\" U\"ucs4\" u8\"utf8\""; + test(s, TokenType.STRING); + } + + @Test + public void unterminatedStringLiteral() { + var s = "\"hello\n"; + var list = List.of(pair("\"hello", TokenType.STRING), + pair("\n", TokenType.DELIMITER)); + test(s, list); + } + + @Test + public void escapeSequenceInChar() { + var s = "'\\u1234' '\\U12345678' '\\1' '\\12' '\\123' '\\x12345' '\\n'"; + test(s, TokenType.CHARACTER); + } + + @Test + public void escapeSequenceInString() { + var s = "\"\\u1234\\U12345678\\1\\12\\123\\x12345\\n\""; + var list = List.of(pair(s, TokenType.STRING)); + test(s, list); + } + + @Test + public void ppNumbers1() { + /* + 0 .123 3E + 123 123E0F 3e+xy + 123LU 0.123E-005 2for1 + */ + test("0 .123 3E 123 123E0F 3e+xy 123LU 0.123E-005 2for1", + TokenType.NUMBER); + } + + @Test + public void ppNumbers2() { + /* + 314 3.14 .314E+1 + 0xa5 .14E+ 1z2z + */ + test("314 3.14 .314E+1 0xa5 .14E+ 1z2z", TokenType.NUMBER); + } + + @Test + public void ppNumbers3() { + test("0x0e+1", TokenType.NUMBER); + } + + @Test + public void ppNumbers4() { + var s = "0x0f+1"; + var list = List.of(pair("0x0f", TokenType.NUMBER), + pair("+", TokenType.OPERATOR), + pair("1", TokenType.NUMBER)); + test(s, list); + } + + @Test + public void ppNumbers5() { + test("0x0.3p10 0x0.3p+10 0x0.3p-10", TokenType.NUMBER); + } + + @Test + public void universalCharacterNameUpperUFirst() { + var s = "char *\\U0001f431s = \"cats\";"; + var list = List.of(pair("char", TokenType.RESERVED), + pair(" ", TokenType.DELIMITER), + pair("*", TokenType.OPERATOR), + pair("\\U0001f431s", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("=", TokenType.OPERATOR), + pair(" ", 
TokenType.DELIMITER), + pair("\"cats\"", TokenType.STRING), + pair(";", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void universalCharacterNameUpperU() { + var s = "char *big\\U0001f431s = \"bigCats\";"; + var list = List.of(pair("char", TokenType.RESERVED), + pair(" ", TokenType.DELIMITER), + pair("*", TokenType.OPERATOR), + pair("big\\U0001f431s", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("=", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("\"bigCats\"", TokenType.STRING), + pair(";", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void universalCharacterNameLowerUFirst() { + var s = "char *\\u732bs = \"cats\";"; + var list = List.of(pair("char", TokenType.RESERVED), + pair(" ", TokenType.DELIMITER), + pair("*", TokenType.OPERATOR), + pair("\\u732bs", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("=", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("\"cats\"", TokenType.STRING), + pair(";", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void universalCharacterNameLowerU() { + var s = "char *big\\u732bs = \"bigCats\";"; + var list = List.of(pair("char", TokenType.RESERVED), + pair(" ", TokenType.DELIMITER), + pair("*", TokenType.OPERATOR), + pair("big\\u732bs", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("=", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("\"bigCats\"", TokenType.STRING), + pair(";", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void implementationDefinedCharactersFirst() { + var s = "char *็Œซs = \"cats\";"; + var list = List.of(pair("char", TokenType.RESERVED), + pair(" ", TokenType.DELIMITER), + pair("*", TokenType.OPERATOR), + pair("็Œซs", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("=", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("\"cats\"", TokenType.STRING), + pair(";", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + 
public void implementationDefinedCharacters() { + var s = "char *big็Œซs = \"bigCats\";"; + var list = List.of(pair("char", TokenType.RESERVED), + pair(" ", TokenType.DELIMITER), + pair("*", TokenType.OPERATOR), + pair("big็Œซs", TokenType.IDENTIFIER), + pair(" ", TokenType.DELIMITER), + pair("=", TokenType.OPERATOR), + pair(" ", TokenType.DELIMITER), + pair("\"bigCats\"", TokenType.STRING), + pair(";", TokenType.PUNCTUATOR)); + test(s, list); + } + + @Test + public void implementationDefinedCharacters0() { + test("ใƒ†ใ‚นใƒˆ _ใƒ†ใ‚นใƒˆ ๐Œ€๐Œ๐Œ‚ _๐Œ€๐Œ๐Œ‚", TokenType.IDENTIFIER); + } + + @Test + public void implementationDefinedCharacters1() { + var s = "๐Œ€๐Œ๐Œ‚"; + test(s, parser -> { + { + var maybeToken = parser.next(); + assertThat(maybeToken.isPresent(), is(true)); + var token = maybeToken.get(); + assertThat(token.getType(), is(TokenType.IDENTIFIER)); + assertThat(token.getValue(), is(s)); + assertThat(token.getSpan().toString(), is("L1:1--3")); + var chars = token.getChars(); + assertThat(chars.size(), is(6)); + var spanList = chars.stream() + .map(c -> c.getSpan().toString()) + .collect(Collectors.toList()); + var expectedList = List.of( + "L1:1", "L1:1", "L1:2", "L1:2", "L1:3", "L1:3"); + assertThat(spanList, is(expectedList)); + } + { + var maybeToken = parser.next(); + assertThat(maybeToken.isEmpty(), is(true)); + } + }); + } + + @Test + public void concatNewlineFollowedByEof() { + var s = "main\\\n\\\n"; + test(s, parser -> { + { + var maybeToken = parser.next(); + assertThat(maybeToken.isPresent(), is(true)); + var token = maybeToken.get(); + assertThat(token.getType(), is(TokenType.IDENTIFIER)); + assertThat(token.getValue(), is("main")); + } + { + var maybeToken = parser.next(); + assertThat(maybeToken.isEmpty(), is(true)); + var maybeEof = parser.getEof(); + assert maybeEof.isPresent(); + var eof = maybeEof.get(); + assert eof.isEof(); + var list = eof.getChildren(); + assertThat(list.size(), is(4)); + 
assertThat(eof.getSpan().toString(), is("L1:5--L2:2")); + } + }); + } + + private static void test(String s, ParserConsumer consumer) { + var source = new StringReader(s); + test(consumer, () -> LexicalParser.of(source)); + } + + private static void test(String s, ParserConsumer consumer, + Collection keywords) { + var source = new StringReader(s); + test(consumer, () -> LexicalParser.of(source, keywords)); + } + + private static void test(ParserConsumer consumer, + Supplier supplier) { + try (var parser = supplier.get()) { + consumer.accept(parser); + } catch (IOException e) { + throw new AssertionError(); + } + } + + private static void test(String s, List> list) { + test(s, parser -> { + for (var c : list) { + var maybeToken = parser.next(); + assertThat(maybeToken.isPresent(), is(true)); + var token = maybeToken.get(); + c.accept(token); + } + { + var maybeToken = parser.next(); + assertThat(maybeToken.isEmpty(), is(true)); + } + }); + } + + private static void test(String s, TokenType expectedType) { + var expectedList = Arrays.stream(s.split(" ")) + .collect(Collectors.toList()); + test(s, parser -> { + var list = new ArrayList(); + for (;;) { + var maybeToken = parser.next(); + if (maybeToken.isEmpty()) { + break; + } + var token = maybeToken.get(); + var type = token.getType(); + if (type == TokenType.DELIMITER) { + assertThat(token.getValue(), is(" ")); + continue; + } + assertThat(type, is(expectedType)); + list.add(token.getValue()); + } + assertThat(list, equalTo(expectedList)); + }); + } + + private static void test(Collection all, + List> list) { + var i = all.iterator(); + for (var c : list) { + assertThat(i.hasNext(), is(true)); + c.accept(i.next()); + } + assertThat(i.hasNext(), is(false)); + } + + private static Consumer pair(String value, TokenType type) { + return t -> { + assertThat(t.getValue(), is(value)); + assertThat(t.getType(), is(type)); + }; + } + + private static Consumer pair(String value, TokenType type, + List> childList) { + return 
t -> { + assertThat(t.getValue(), is(value)); + assertThat(t.getType(), is(type)); + var children = t.getChildren(); + var size = children.size(); + assertThat(size, is(childList.size())); + for (var k = 0; k < size; ++k) { + childList.get(k).accept(children.get(k)); + } + }; + } + + private static Consumer pair(char value, String span) { + return c -> { + assertThat(c.toChar(), is(value)); + assertThat(c.getSpan().toString(), is(span)); + assertThat(c.getChildren().isEmpty(), is(true)); + assertThat(c.isEof(), is(false)); + }; + } + + private static Consumer pair(char value, String span, + List> list) { + return c -> { + assertThat(c.toChar(), is(value)); + assertThat(c.getSpan().toString(), is(span)); + var children = c.getChildren(); + assertThat(children.isEmpty(), is(false)); + test(children, list); + assertThat(c.isEof(), is(false)); + }; + } + + @FunctionalInterface + public interface ParserConsumer { + void accept(LexicalParser parser) throws IOException; + } +} diff --git a/src/test/java/com/maroontress/clione/impl/UnifiedNewlineReaderTest.java b/src/test/java/com/maroontress/clione/impl/UnifiedNewlineReaderTest.java new file mode 100644 index 0000000..6a5bbd3 --- /dev/null +++ b/src/test/java/com/maroontress/clione/impl/UnifiedNewlineReaderTest.java @@ -0,0 +1,23 @@ +package com.maroontress.clione.impl; + +import java.io.IOException; +import java.io.StringReader; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public final class UnifiedNewlineReaderTest { + + @Test + public void mixedNewlines() throws IOException { + var s = "a\r\r\n\nb"; + var r = new UnifiedNewlineReader(new StringReader(s)); + var array = new char[5]; + var n = r.read(array, 0, array.length); + assertEquals(array.length, n); + assertArrayEquals(array, "a\n\n\nb".toCharArray()); + var i = r.read(); + assertEquals(-1, i); + } +} diff --git 
a/src/test/resources/com/example/emojicat.c b/src/test/resources/com/example/emojicat.c new file mode 100644 index 0000000..8373162 --- /dev/null +++ b/src/test/resources/com/example/emojicat.c @@ -0,0 +1 @@ +char *cat = u8"๐Ÿฑ"; diff --git a/src/test/resources/com/example/helloworld.c b/src/test/resources/com/example/helloworld.c new file mode 100644 index 0000000..31cceb4 --- /dev/null +++ b/src/test/resources/com/example/helloworld.c @@ -0,0 +1,6 @@ +#include + +int main(void) +{ + printf("hello world\n"); +} diff --git a/src/test/resources/com/example/main.c b/src/test/resources/com/example/main.c new file mode 100644 index 0000000..b59ba3f --- /dev/null +++ b/src/test/resources/com/example/main.c @@ -0,0 +1,2 @@ +ma??/ +in