From d82139cdbccc1b01fd43303f8c56ddf2f2ffa72a Mon Sep 17 00:00:00 2001 From: Andrey Somov Date: Tue, 17 Sep 2024 16:43:41 +0400 Subject: [PATCH] Fix issue 1098: HighSurrogate can be the last char in the data window --- .../yaml/snakeyaml/reader/StreamReader.java | 8 +-- .../snakeyaml/reader/ReaderStringTest.java | 52 +++++++++++++------ 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/yaml/snakeyaml/reader/StreamReader.java b/src/main/java/org/yaml/snakeyaml/reader/StreamReader.java index 199038dff..59346a452 100644 --- a/src/main/java/org/yaml/snakeyaml/reader/StreamReader.java +++ b/src/main/java/org/yaml/snakeyaml/reader/StreamReader.java @@ -175,19 +175,19 @@ private boolean ensureEnoughData(int size) { private void update() { try { - int read = stream.read(buffer, 0, buffer.length - 1); // FIXME why -1 ??? + // read one less because the last char may be HighSurrogate + int read = stream.read(buffer, 0, buffer.length - 1); if (read > 0) { int cpIndex = (dataLength - pointer); dataWindow = Arrays.copyOfRange(dataWindow, pointer, dataLength + read); - if (Character.isHighSurrogate(buffer[read - 1])) { if (stream.read(buffer, read, 1) == -1) { - eof = true; + throw new ReaderException(name, index + read, buffer[read - 1], + "The last char is HighSurrogate (no LowSurrogate detected)."); } else { read++; } } - int nonPrintable = ' '; for (int i = 0; i < read; cpIndex++) { int codePoint = Character.codePointAt(buffer, i); diff --git a/src/test/java/org/yaml/snakeyaml/reader/ReaderStringTest.java b/src/test/java/org/yaml/snakeyaml/reader/ReaderStringTest.java index 6fc8defae..b56d30114 100644 --- a/src/test/java/org/yaml/snakeyaml/reader/ReaderStringTest.java +++ b/src/test/java/org/yaml/snakeyaml/reader/ReaderStringTest.java @@ -13,9 +13,10 @@ */ package org.yaml.snakeyaml.reader; -import java.io.StringReader; import junit.framework.TestCase; +import java.io.StringReader; + public class ReaderStringTest extends TestCase { public void testCheckPrintable() { @@ -40,25 +41,45 @@ public void testCheckNonPrintable() { } /** - * test that Reading date and checking String work the same + * test reading all the chars */ public void testCheckAll() { + int counterSurrogates = 0; for (char i = 0; i < 256 * 256 - 1; i++) { - char[] chars = new char[1]; - chars[0] = i; - String str = new String(chars); - boolean regularExpressionResult = StreamReader.isPrintable(str); + if (Character.isHighSurrogate(i)) { + counterSurrogates++; + } else { + char[] chars = new char[1]; + chars[0] = i; + String str = new String(chars); + boolean regularExpressionResult = StreamReader.isPrintable(str); + + boolean charsArrayResult = true; + try { + new StreamReader(new StringReader(str)).peek(); + } catch (Exception e) { + String error = e.getMessage(); + assertTrue(error, error.startsWith("unacceptable character") + || error.equals("special characters are not allowed")); + charsArrayResult = false; + } + assertEquals("Failed for #" + i, regularExpressionResult, charsArrayResult); + } + } + // https://en.wikipedia.org/wiki/Universal_Character_Set_characters + assertEquals("There are 1024 high surrogates (D800–DBFF)", 1024, counterSurrogates); + } - boolean charsArrayResult = true; - try { - new StreamReader(new StringReader(str)).peek(); - } catch (Exception e) { - String error = e.getMessage(); - assertTrue(error, error.startsWith("unacceptable character") - || error.equals("special characters are not allowed")); - charsArrayResult = false; + public void testHighSurrogateAlone() { + StreamReader reader = new StreamReader("test\uD800"); + try { + while (reader.peek() > 0) { + reader.forward(1); } - assertEquals("Failed for #" + i, regularExpressionResult, charsArrayResult); + } catch (ReaderException e) { + assertTrue(e.toString() + .contains("(0xD800) The last char is HighSurrogate (no LowSurrogate detected)")); + assertEquals(5, e.getPosition()); } } @@ -90,5 +111,4 @@ public void testPeekInt() { assertEquals('s', reader.peek(1)); assertEquals('t', reader.peek(2)); } - }