From a1e33fa8ca96559563b20b0fcb1112548aefefe0 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Fri, 16 Oct 2020 20:04:47 +0200 Subject: [PATCH 01/10] Implement LocatedSpan::get_line(). Add a function to get the full input line containing the (start point of the) LocatedSpan. As suggested in #53. --- src/lib.rs | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index f9259b5..6718a44 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -277,6 +277,45 @@ impl LocatedSpan { (column, &before_self[self.offset - (column - 1)..]) } + /// Return the line that contains this LocatedSpan. + /// + /// The `get_column` and `get_utf8_column` functions returns + /// indexes that corresponds to the line returned by this function. + /// + /// ``` + /// # extern crate nom_locate; + /// # extern crate nom; + /// # use nom_locate::LocatedSpan; + /// # use nom::{Slice, FindSubstring}; + /// # + /// # fn main() { + /// let program = LocatedSpan::new( + /// "Hello World!\ + /// \nThis is a multi-line input\ + /// \nthat ends after this line.\n"); + /// let multi = program.find_substring("multi").unwrap(); + /// + /// assert_eq!( + /// program.slice(multi..).get_line(), + /// Some("This is a multi-line input".as_ref()), + /// ); + /// # } + /// ``` + pub fn get_line(&self) -> Option<&[u8]> { + let self_bytes = self.fragment.as_bytes(); + let self_ptr = self_bytes.as_ptr(); + let offset = self.get_column() - 1; + let the_line = unsafe { + assert!( + offset <= isize::max_value() as usize, + "offset is too big" + ); + let line_start_ptr = self_ptr.offset(-(offset as isize)); + slice::from_raw_parts(line_start_ptr, offset + self_bytes.len()) + }; + the_line.split(|c| *c == b'\n').next() + } + /// Return the column index, assuming 1 byte = 1 column. /// /// Use it for ascii text, or use get_utf8_column for UTF8. 
From c4df14551364c8f395cb1cc8f9049b072e65c141 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Fri, 16 Oct 2020 22:47:43 +0200 Subject: [PATCH 02/10] Add some tests. --- src/tests.rs | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/src/tests.rs b/src/tests.rs index 537b53e..8e5ad39 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -414,3 +414,96 @@ fn it_should_display_hex() { "00000000\t61 62 63 \tabc\n".to_owned() ); } + +#[test] +fn line_of_empty_span_is_empty() { + assert_eq!( + StrSpan::new("").get_line(), + Some("".as_ref()), + ); +} + +#[test] +fn line_of_single_line_start_is_whole() { + assert_eq!( + StrSpan::new("A single line").get_line(), + Some("A single line".as_ref()), + ); +} +#[test] +fn line_of_single_line_end_is_whole() { + let data = "A single line"; + assert_eq!( + StrSpan::new(data).slice(data.len()..).get_line(), + Some("A single line".as_ref()), + ); +} + +#[test] +fn line_of_start_is_first() { + assert_eq!( + StrSpan::new( + "One line of text\ + \nFollowed by a second\ + \nand a third\n" + ).get_line(), + Some("One line of text".as_ref()), + ); +} + +#[test] +fn line_of_nl_is_before() { + let data = + "One line of text\ + \nFollowed by a second\ + \nand a third\n"; + assert_eq!( + StrSpan::new(data).slice(data.find('\n').unwrap()..).get_line(), + Some("One line of text".as_ref()), + ); +} + +#[test] +fn line_of_end_after_nl_is_empty() { + let data = + "One line of text\ + \nFollowed by a second\ + \nand a third\n"; + assert_eq!( + StrSpan::new(data).slice(data.len()..).get_line(), + Some("".as_ref()), + ); +} + +#[test] +fn line_of_end_no_nl_is_last() { + let data = + "One line of text\ + \nFollowed by a second\ + \nand a third"; + assert_eq!( + StrSpan::new(data).slice(data.len()..).get_line(), + Some("and a third".as_ref()), + ); +} + +#[test] +fn line_for_non_ascii_chars() { + // I don't really know if this Oriya text makes sense. 
+ let data = StrSpan::new( + "Några rader text på Svenska.\ + \nFörra raden var först, den här är i mitten\ + \noch här är sista raden.\n"); + let s = data.slice(data.find_substring("först").unwrap()..); + assert_eq!( + format!( + "{line_no:3}: {line_text}\n {0:>lpos$}^- The match\n", + "", + line_no = s.location_line(), + line_text = core::str::from_utf8(s.get_line().unwrap()).unwrap(), + lpos = s.get_utf8_column(), + ), + " 2: Förra raden var först, den här är i mitten\ + \n ^- The match\n", + ); +} From 65cf3e1e3bb747eb44a42bab62ac15a019dc543c Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Fri, 16 Oct 2020 22:59:48 +0200 Subject: [PATCH 03/10] Remove bogus comment. --- src/tests.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tests.rs b/src/tests.rs index 8e5ad39..b64db46 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -489,7 +489,6 @@ fn line_of_end_no_nl_is_last() { #[test] fn line_for_non_ascii_chars() { - // I don't really know if this Oriya text makes sense. let data = StrSpan::new( "Några rader text på Svenska.\ \nFörra raden var först, den här är i mitten\ From 942d0c96fdd9d7eac487c772e48702d7deb323a1 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Fri, 16 Oct 2020 23:10:32 +0200 Subject: [PATCH 04/10] No need for get_line() to return Option. 
--- src/lib.rs | 9 ++++++--- src/tests.rs | 16 ++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6718a44..2120b00 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -297,11 +297,11 @@ impl LocatedSpan { /// /// assert_eq!( /// program.slice(multi..).get_line(), - /// Some("This is a multi-line input".as_ref()), + /// "This is a multi-line input".as_bytes(), /// ); /// # } /// ``` - pub fn get_line(&self) -> Option<&[u8]> { + pub fn get_line(&self) -> &[u8] { let self_bytes = self.fragment.as_bytes(); let self_ptr = self_bytes.as_ptr(); let offset = self.get_column() - 1; @@ -313,7 +313,10 @@ impl LocatedSpan { let line_start_ptr = self_ptr.offset(-(offset as isize)); slice::from_raw_parts(line_start_ptr, offset + self_bytes.len()) }; - the_line.split(|c| *c == b'\n').next() + match memchr::memchr(b'\n', the_line) { + None => the_line, + Some(pos) => &the_line[..pos], + } } /// Return the column index, assuming 1 byte = 1 column. diff --git a/src/tests.rs b/src/tests.rs index b64db46..e0ded8d 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -419,7 +419,7 @@ fn it_should_display_hex() { fn line_of_empty_span_is_empty() { assert_eq!( StrSpan::new("").get_line(), - Some("".as_ref()), + "".as_bytes(), ); } @@ -427,7 +427,7 @@ fn line_of_empty_span_is_empty() { fn line_of_single_line_start_is_whole() { assert_eq!( StrSpan::new("A single line").get_line(), - Some("A single line".as_ref()), + "A single line".as_bytes(), ); } #[test] @@ -435,7 +435,7 @@ fn line_of_single_line_end_is_whole() { let data = "A single line"; assert_eq!( StrSpan::new(data).slice(data.len()..).get_line(), - Some("A single line".as_ref()), + "A single line".as_bytes(), ); } @@ -447,7 +447,7 @@ fn line_of_start_is_first() { \nFollowed by a second\ \nand a third\n" ).get_line(), - Some("One line of text".as_ref()), + "One line of text".as_bytes(), ); } @@ -459,7 +459,7 @@ fn line_of_nl_is_before() { \nand a third\n"; assert_eq!( 
StrSpan::new(data).slice(data.find('\n').unwrap()..).get_line(), - Some("One line of text".as_ref()), + "One line of text".as_bytes(), ); } @@ -471,7 +471,7 @@ fn line_of_end_after_nl_is_empty() { \nand a third\n"; assert_eq!( StrSpan::new(data).slice(data.len()..).get_line(), - Some("".as_ref()), + "".as_bytes(), ); } @@ -483,7 +483,7 @@ fn line_of_end_no_nl_is_last() { \nand a third"; assert_eq!( StrSpan::new(data).slice(data.len()..).get_line(), - Some("and a third".as_ref()), + "and a third".as_bytes(), ); } @@ -499,7 +499,7 @@ fn line_for_non_ascii_chars() { "{line_no:3}: {line_text}\n {0:>lpos$}^- The match\n", "", line_no = s.location_line(), - line_text = core::str::from_utf8(s.get_line().unwrap()).unwrap(), + line_text = core::str::from_utf8(s.get_line()).unwrap(), lpos = s.get_utf8_column(), ), " 2: Förra raden var först, den här är i mitten\ From 060fd1fcfccbb8a784698567ef93cea7f5a830e4 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sat, 17 Oct 2020 09:55:22 +0200 Subject: [PATCH 05/10] The test that uses `format!` requires std. --- src/tests.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests.rs b/src/tests.rs index e0ded8d..fad947b 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -487,6 +487,7 @@ fn line_of_end_no_nl_is_last() { ); } +#[cfg(feature = "std")] #[test] fn line_for_non_ascii_chars() { let data = StrSpan::new( From 1810ec310e4f0fc942fd2c5c668ca853e51c4bba Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sat, 17 Oct 2020 10:07:09 +0200 Subject: [PATCH 06/10] Some rustfmt. 
--- src/tests.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/tests.rs b/src/tests.rs index fad947b..afa133b 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -417,10 +417,7 @@ fn it_should_display_hex() { #[test] fn line_of_empty_span_is_empty() { - assert_eq!( - StrSpan::new("").get_line(), - "".as_bytes(), - ); + assert_eq!(StrSpan::new("").get_line(), "".as_bytes()); } #[test] @@ -446,27 +443,28 @@ fn line_of_start_is_first() { "One line of text\ \nFollowed by a second\ \nand a third\n" - ).get_line(), + ) + .get_line(), "One line of text".as_bytes(), ); } #[test] fn line_of_nl_is_before() { - let data = - "One line of text\ + let data = "One line of text\ \nFollowed by a second\ \nand a third\n"; assert_eq!( - StrSpan::new(data).slice(data.find('\n').unwrap()..).get_line(), + StrSpan::new(data) + .slice(data.find('\n').unwrap()..) + .get_line(), "One line of text".as_bytes(), ); } #[test] fn line_of_end_after_nl_is_empty() { - let data = - "One line of text\ + let data = "One line of text\ \nFollowed by a second\ \nand a third\n"; assert_eq!( @@ -477,8 +475,7 @@ fn line_of_end_after_nl_is_empty() { #[test] fn line_of_end_no_nl_is_last() { - let data = - "One line of text\ + let data = "One line of text\ \nFollowed by a second\ \nand a third"; assert_eq!( @@ -493,7 +490,8 @@ fn line_for_non_ascii_chars() { let data = StrSpan::new( "Några rader text på Svenska.\ \nFörra raden var först, den här är i mitten\ - \noch här är sista raden.\n"); + \noch här är sista raden.\n", + ); let s = data.slice(data.find_substring("först").unwrap()..); assert_eq!( format!( From 84ca913cbca63a9468acdc51f59afc29443c652a Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sun, 18 Oct 2020 00:28:27 +0200 Subject: [PATCH 07/10] Refactor two similar unsafe blocks to one. 
--- src/lib.rs | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2120b00..b46c0f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -257,17 +257,24 @@ impl LocatedSpan { &self.fragment } - fn get_columns_and_bytes_before(&self) -> (usize, &[u8]) { + fn get_unoffsetted_slice(&self) -> &[u8] { let self_bytes = self.fragment.as_bytes(); let self_ptr = self_bytes.as_ptr(); - let before_self = unsafe { + unsafe { assert!( self.offset <= isize::max_value() as usize, "offset is too big" ); let orig_input_ptr = self_ptr.offset(-(self.offset as isize)); - slice::from_raw_parts(orig_input_ptr, self.offset) - }; + slice::from_raw_parts( + orig_input_ptr, + self.offset + self_bytes.len(), + ) + } + } + + fn get_columns_and_bytes_before(&self) -> (usize, &[u8]) { + let before_self = &self.get_unoffsetted_slice()[..self.offset]; let column = match memchr::memrchr(b'\n', before_self) { None => self.offset + 1, @@ -302,20 +309,11 @@ impl LocatedSpan { /// # } /// ``` pub fn get_line(&self) -> &[u8] { - let self_bytes = self.fragment.as_bytes(); - let self_ptr = self_bytes.as_ptr(); - let offset = self.get_column() - 1; - let the_line = unsafe { - assert!( - offset <= isize::max_value() as usize, - "offset is too big" - ); - let line_start_ptr = self_ptr.offset(-(offset as isize)); - slice::from_raw_parts(line_start_ptr, offset + self_bytes.len()) - }; - match memchr::memchr(b'\n', the_line) { + let column0 = self.get_column() - 1; + let the_line = &self.get_unoffsetted_slice()[self.offset - column0..]; + match memchr::memchr(b'\n', &the_line[column0..]) { None => the_line, - Some(pos) => &the_line[..pos], + Some(pos) => &the_line[..column0 + pos], } } From 6fb916a99affce0b8b8ca68572ce7c538af4437b Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sun, 18 Oct 2020 13:59:05 +0200 Subject: [PATCH 08/10] Add some disclaimer comments / docs. 
--- src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index b46c0f5..81396d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -257,6 +257,10 @@ impl LocatedSpan { &self.fragment } + // Attempt to get the "original" data slice back, by extending + // self.fragment backwards by self.offset. + // Note that any bytes truncated from after self.fragment will not + // be recovered. fn get_unoffsetted_slice(&self) -> &[u8] { let self_bytes = self.fragment.as_bytes(); let self_ptr = self_bytes.as_ptr(); @@ -289,6 +293,10 @@ impl LocatedSpan { /// The `get_column` and `get_utf8_column` functions returns /// indexes that corresponds to the line returned by this function. /// + /// Note that if this LocatedSpan ends before the end of the + /// original data, the result of calling `get_line()` will not + /// include any data from after the LocatedSpan. + /// /// ``` /// # extern crate nom_locate; /// # extern crate nom; From 60ea6cba480cd1a751d04dcb4538bb1ca01d5de8 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sun, 18 Oct 2020 19:12:44 +0200 Subject: [PATCH 09/10] Rename get_line to get_line_beginning. --- src/lib.rs | 8 ++++---- src/tests.rs | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 81396d7..d668b08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -294,8 +294,8 @@ impl LocatedSpan { /// indexes that corresponds to the line returned by this function. /// /// Note that if this LocatedSpan ends before the end of the - /// original data, the result of calling `get_line()` will not - /// include any data from after the LocatedSpan. + /// original data, the result of calling `get_line_beginning()` + /// will not include any data from after the LocatedSpan. 
/// /// ``` /// # extern crate nom_locate; @@ -311,12 +311,12 @@ impl LocatedSpan { /// let multi = program.find_substring("multi").unwrap(); /// /// assert_eq!( - /// program.slice(multi..).get_line(), + /// program.slice(multi..).get_line_beginning(), /// "This is a multi-line input".as_bytes(), /// ); /// # } /// ``` - pub fn get_line(&self) -> &[u8] { + pub fn get_line_beginning(&self) -> &[u8] { let column0 = self.get_column() - 1; let the_line = &self.get_unoffsetted_slice()[self.offset - column0..]; match memchr::memchr(b'\n', &the_line[column0..]) { diff --git a/src/tests.rs b/src/tests.rs index afa133b..913b860 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -417,13 +417,13 @@ fn it_should_display_hex() { #[test] fn line_of_empty_span_is_empty() { - assert_eq!(StrSpan::new("").get_line(), "".as_bytes()); + assert_eq!(StrSpan::new("").get_line_beginning(), "".as_bytes()); } #[test] fn line_of_single_line_start_is_whole() { assert_eq!( - StrSpan::new("A single line").get_line(), + StrSpan::new("A single line").get_line_beginning(), "A single line".as_bytes(), ); } @@ -431,7 +431,7 @@ fn line_of_single_line_start_is_whole() { fn line_of_single_line_end_is_whole() { let data = "A single line"; assert_eq!( - StrSpan::new(data).slice(data.len()..).get_line(), + StrSpan::new(data).slice(data.len()..).get_line_beginning(), "A single line".as_bytes(), ); } @@ -444,7 +444,7 @@ fn line_of_start_is_first() { \nFollowed by a second\ \nand a third\n" ) - .get_line(), + .get_line_beginning(), "One line of text".as_bytes(), ); } @@ -457,7 +457,7 @@ fn line_of_nl_is_before() { assert_eq!( StrSpan::new(data) .slice(data.find('\n').unwrap()..) 
- .get_line(), + .get_line_beginning(), "One line of text".as_bytes(), ); } @@ -468,7 +468,7 @@ fn line_of_end_after_nl_is_empty() { \nFollowed by a second\ \nand a third\n"; assert_eq!( - StrSpan::new(data).slice(data.len()..).get_line(), + StrSpan::new(data).slice(data.len()..).get_line_beginning(), "".as_bytes(), ); } @@ -479,7 +479,7 @@ fn line_of_end_no_nl_is_last() { \nFollowed by a second\ \nand a third"; assert_eq!( - StrSpan::new(data).slice(data.len()..).get_line(), + StrSpan::new(data).slice(data.len()..).get_line_beginning(), "and a third".as_bytes(), ); } @@ -498,7 +498,7 @@ fn line_for_non_ascii_chars() { "{line_no:3}: {line_text}\n {0:>lpos$}^- The match\n", "", line_no = s.location_line(), - line_text = core::str::from_utf8(s.get_line()).unwrap(), + line_text = core::str::from_utf8(s.get_line_beginning()).unwrap(), lpos = s.get_utf8_column(), ), " 2: Förra raden var först, den här är i mitten\ From de713cb5e84d3bf3cb460bdd315a46a51c04cb52 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sun, 18 Oct 2020 19:21:18 +0200 Subject: [PATCH 10/10] Add line_begining_may_ot_be_entire_len test. This test documents how `get_line_beginning()` differs from a hypothetical `get_line()` method. --- src/tests.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/tests.rs b/src/tests.rs index 913b860..c07b273 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -484,6 +484,21 @@ fn line_of_end_no_nl_is_last() { ); } +/// This test documents how `get_line_beginning()` differs from +/// a hypothetical `get_line()` method. +#[test] +fn line_begining_may_ot_be_entire_len() { + let data = "One line of text\ + \nFollowed by a second\ + \nand a third"; + let by = "by"; + let pos = data.find_substring(by).unwrap(); + assert_eq!( + StrSpan::new(data).slice(pos..pos+by.len()).get_line_beginning(), + "Followed by".as_bytes(), + ); +} + #[cfg(feature = "std")] #[test] fn line_for_non_ascii_chars() {