use std::borrow::Cow;
use std::hash::Hash;
use std::ops::Range;
/// A value that can hand out a borrowed [`DiffableStr`].
///
/// The implementations in this file cover every [`DiffableStr`] itself,
/// `String`, `Cow` of a [`DiffableStr`], and (behind the `bytes` feature)
/// `Vec<u8>`.
pub trait DiffableStrRef {
/// The borrowed, possibly unsized, [`DiffableStr`] type exposed by this value.
type Output: DiffableStr + ?Sized;
/// Borrows `self` as its underlying [`DiffableStr`].
fn as_diffable_str(&self) -> &Self::Output;
}
/// Every [`DiffableStr`] is its own diffable reference.
impl<T> DiffableStrRef for T
where
    T: DiffableStr + ?Sized,
{
    type Output = Self;

    /// Returns `self` unchanged.
    fn as_diffable_str(&self) -> &Self::Output {
        self
    }
}
/// An owned `String` is diffed through its `str` contents.
impl DiffableStrRef for String {
    type Output = str;

    /// Borrows the whole string as a `str`.
    fn as_diffable_str(&self) -> &Self::Output {
        &self[..]
    }
}
/// A `Cow` of a [`DiffableStr`] is diffed through the value it dereferences to,
/// whether it is currently borrowed or owned.
impl<'a, T> DiffableStrRef for Cow<'a, T>
where
    T: DiffableStr + ?Sized,
{
    type Output = T;

    /// Dereferences the `Cow` to the underlying `T`.
    fn as_diffable_str(&self) -> &Self::Output {
        &**self
    }
}
/// String-like data that can be tokenized for diffing.
///
/// This file implements the trait for `str` and, behind the `bytes`
/// feature, for `[u8]`. Every tokenizer returns borrowed pieces of `self`.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
/// Splits the value into lines; each line keeps its terminator
/// (`\n`, `\r` or `\r\n`). A trailing piece without a terminator is
/// returned as the last line.
fn tokenize_lines(&self) -> Vec<&Self>;
/// Splits the value into alternating runs of newline characters
/// (`\r` / `\n`) and runs of all other characters.
fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;
/// Splits the value into alternating runs of whitespace and
/// non-whitespace characters.
fn tokenize_words(&self) -> Vec<&Self>;
/// Splits the value into individual characters.
fn tokenize_chars(&self) -> Vec<&Self>;
/// Splits the value at unicode word boundaries.
///
/// Only available with the `unicode` feature.
#[cfg(feature = "unicode")]
fn tokenize_unicode_words(&self) -> Vec<&Self>;
/// Splits the value into grapheme clusters.
///
/// Only available with the `unicode` feature.
#[cfg(feature = "unicode")]
fn tokenize_graphemes(&self) -> Vec<&Self>;
/// Returns the value as a `str` if possible. The `str` implementation
/// always returns `Some`; the `[u8]` implementation returns `None` for
/// invalid UTF-8.
fn as_str(&self) -> Option<&str>;
/// Returns the value as a string, borrowing where possible; the `[u8]`
/// implementation converts lossily via `String::from_utf8_lossy`.
fn to_string_lossy(&self) -> Cow<'_, str>;
/// Returns `true` if the value ends in `\r` or `\n`.
fn ends_with_newline(&self) -> bool;
/// Returns the length of the value; both implementations in this file
/// report the length in bytes.
fn len(&self) -> usize;
/// Returns the sub-slice covered by `rng`. The implementations in this
/// file index directly, so an invalid range panics.
fn slice(&self, rng: Range<usize>) -> &Self;
/// Returns the value as raw bytes.
fn as_bytes(&self) -> &[u8];
/// Returns `true` if [`len`](DiffableStr::len) is zero.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
/// Splits `text` into maximal runs of consecutive characters for which
/// `classify` returns the same value.
fn split_str_runs<F>(text: &str, classify: F) -> Vec<&str>
where
    F: Fn(char) -> bool,
{
    let mut runs = Vec::new();
    let mut rest = text;
    while let Some(first) = rest.chars().next() {
        let class = classify(first);
        // The run ends at the first character of the other class, or at the
        // end of the input when there is none.
        let run_len = rest
            .find(|ch: char| classify(ch) != class)
            .unwrap_or(rest.len());
        let (run, tail) = rest.split_at(run_len);
        runs.push(run);
        rest = tail;
    }
    runs
}

/// Tokenization of UTF-8 string slices.
impl DiffableStr for str {
    /// Splits into lines, keeping the `\n`, `\r` or `\r\n` terminator attached
    /// to the line it ends.
    fn tokenize_lines(&self) -> Vec<&Self> {
        let mut lines = Vec::new();
        let mut line_start = 0;
        let mut chars = self.char_indices().peekable();

        while let Some((pos, ch)) = chars.next() {
            // Byte offset just past the terminator that ends the current line.
            let line_end = match ch {
                '\r' => {
                    let followed_by_lf = matches!(chars.peek(), Some(&(_, '\n')));
                    if followed_by_lf {
                        // Treat `\r\n` as a single terminator.
                        chars.next();
                        pos + 2
                    } else {
                        pos + 1
                    }
                }
                '\n' => pos + 1,
                _ => continue,
            };
            lines.push(&self[line_start..line_end]);
            line_start = line_end;
        }

        // Whatever follows the last terminator is an unterminated final line.
        if line_start < self.len() {
            lines.push(&self[line_start..]);
        }
        lines
    }

    /// Splits into alternating runs of newline characters and other characters.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
        split_str_runs(self, |ch| ch == '\r' || ch == '\n')
    }

    /// Splits into alternating runs of whitespace and non-whitespace.
    fn tokenize_words(&self) -> Vec<&Self> {
        split_str_runs(self, char::is_whitespace)
    }

    /// Splits into one slice per `char`.
    fn tokenize_chars(&self) -> Vec<&Self> {
        let mut pieces = Vec::new();
        for (start, ch) in self.char_indices() {
            pieces.push(&self[start..start + ch.len_utf8()]);
        }
        pieces
    }

    /// Splits at unicode word boundaries.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self> {
        use unicode_segmentation::UnicodeSegmentation;

        self.split_word_bounds().collect()
    }

    /// Splits into extended grapheme clusters.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self> {
        use unicode_segmentation::UnicodeSegmentation;

        self.graphemes(true).collect()
    }

    /// A `str` is always available as a `str`.
    fn as_str(&self) -> Option<&str> {
        Some(self)
    }

    /// Borrows `self`; no conversion is needed.
    fn to_string_lossy(&self) -> Cow<'_, str> {
        Cow::from(self)
    }

    /// Returns `true` if the last character is `\r` or `\n`.
    fn ends_with_newline(&self) -> bool {
        self.ends_with(|ch: char| ch == '\r' || ch == '\n')
    }

    /// Length in bytes.
    fn len(&self) -> usize {
        str::len(self)
    }

    /// Slices by byte range; panics like ordinary `str` indexing.
    fn slice(&self, rng: Range<usize>) -> &Self {
        &self[rng]
    }

    /// The UTF-8 bytes of the string.
    fn as_bytes(&self) -> &[u8] {
        str::as_bytes(self)
    }
}
/// Diffing support for raw byte strings, available with the `bytes` feature.
#[cfg(feature = "bytes")]
mod bytes_support {
    use super::*;

    use bstr::ByteSlice;

    /// Splits `bytes` into maximal runs of consecutive decoded characters for
    /// which `classify` returns the same value.
    fn split_byte_runs<F>(bytes: &[u8], classify: F) -> Vec<&[u8]>
    where
        F: Fn(char) -> bool,
    {
        let mut runs = Vec::new();
        // `char_indices` yields `(start, end, char)` triples.
        let mut chars = bytes.char_indices().peekable();

        while let Some((run_start, first_end, first)) = chars.next() {
            let class = classify(first);
            let mut run_end = first_end;
            // Extend the run for as long as the next character has the same class.
            while let Some(&(_, peeked_end, peeked)) = chars.peek() {
                if classify(peeked) != class {
                    break;
                }
                run_end = peeked_end;
                chars.next();
            }
            runs.push(&bytes[run_start..run_end]);
        }
        runs
    }

    /// A byte vector is diffed through its slice contents.
    impl DiffableStrRef for Vec<u8> {
        type Output = [u8];

        /// Borrows the whole vector as a byte slice.
        fn as_diffable_str(&self) -> &Self::Output {
            &self[..]
        }
    }

    /// Tokenization of byte slices.
    impl DiffableStr for [u8] {
        /// Splits into lines, keeping the `\n`, `\r` or `\r\n` terminator
        /// attached to the line it ends.
        fn tokenize_lines(&self) -> Vec<&Self> {
            let mut lines = Vec::new();
            let mut line_start = 0;
            let mut chars = self.char_indices().peekable();

            while let Some((_, char_end, ch)) = chars.next() {
                // Byte offset just past the terminator that ends the current line.
                let line_end = match ch {
                    '\r' => {
                        let followed_by_lf = matches!(chars.peek(), Some(&(_, _, '\n')));
                        if followed_by_lf {
                            // Treat `\r\n` as a single terminator.
                            chars.next();
                            char_end + 1
                        } else {
                            char_end
                        }
                    }
                    '\n' => char_end,
                    _ => continue,
                };
                lines.push(&self[line_start..line_end]);
                line_start = line_end;
            }

            // Whatever follows the last terminator is an unterminated final line.
            if line_start < self.len() {
                lines.push(&self[line_start..]);
            }
            lines
        }

        /// Splits into alternating runs of newline characters and other characters.
        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            split_byte_runs(self, |ch| ch == '\r' || ch == '\n')
        }

        /// Splits into alternating runs of whitespace and non-whitespace.
        fn tokenize_words(&self) -> Vec<&Self> {
            split_byte_runs(self, char::is_whitespace)
        }

        /// Splits at unicode word boundaries.
        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            self.words_with_breaks()
                .map(|word| word.as_bytes())
                .collect()
        }

        /// Splits into grapheme clusters.
        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes()
                .map(|cluster| cluster.as_bytes())
                .collect()
        }

        /// Splits into one slice per decoded character.
        fn tokenize_chars(&self) -> Vec<&Self> {
            let mut pieces = Vec::new();
            for (start, end, _) in self.char_indices() {
                pieces.push(&self[start..end]);
            }
            pieces
        }

        /// Returns the bytes as a `str`, or `None` if they are not valid UTF-8.
        fn as_str(&self) -> Option<&str> {
            std::str::from_utf8(self).ok()
        }

        /// Converts via `String::from_utf8_lossy`.
        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        /// Returns `true` if the last byte is `\r` or `\n`.
        fn ends_with_newline(&self) -> bool {
            self.last()
                .map_or(false, |&byte| byte == b'\r' || byte == b'\n')
        }

        /// Length in bytes.
        fn len(&self) -> usize {
            <[u8]>::len(self)
        }

        /// Slices by byte range; panics like ordinary slice indexing.
        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        /// The slice itself.
        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
}
/// Line tokenization of `str` keeps each terminator attached to its line.
#[test]
fn test_split_lines() {
    let mixed_endings = DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast");
    assert_eq!(
        mixed_endings,
        vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );

    let two_blank_lines = DiffableStr::tokenize_lines("\n\n");
    assert_eq!(two_blank_lines, vec!["\n", "\n"]);

    let single_newline = DiffableStr::tokenize_lines("\n");
    assert_eq!(single_newline, vec!["\n"]);

    // Empty input produces no lines at all.
    let nothing = DiffableStr::tokenize_lines("");
    assert!(nothing.is_empty());
}
/// Word tokenization of `str` alternates word and whitespace runs.
#[test]
fn test_split_words() {
    let words = DiffableStr::tokenize_words("foo bar baz\n\n aha");
    let expected = ["foo", " ", "bar", " ", "baz", "\n\n ", "aha"];
    assert_eq!(words, expected);
}
/// Char tokenization of `str` splits a snowflake emoji from its variation selector.
#[test]
fn test_split_chars() {
    // "abcf", U+00F6 (o with diaeresis), U+2744 (snowflake), U+FE0F (variation selector-16).
    let chars = DiffableStr::tokenize_chars("abcf\u{f6}\u{2744}\u{fe0f}");
    let expected = vec!["a", "b", "c", "f", "\u{f6}", "\u{2744}", "\u{fe0f}"];
    assert_eq!(chars, expected);
}
/// Grapheme tokenization of `str` keeps the snowflake and its variation
/// selector together.
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    // "abcf", U+00F6 (o with diaeresis), U+2744 (snowflake), U+FE0F (variation selector-16).
    let graphemes = DiffableStr::tokenize_graphemes("abcf\u{f6}\u{2744}\u{fe0f}");
    let expected = vec!["a", "b", "c", "f", "\u{f6}", "\u{2744}\u{fe0f}"];
    assert_eq!(graphemes, expected);
}
/// Line tokenization of byte slices keeps each terminator attached to its line.
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    let mixed_endings = DiffableStr::tokenize_lines(&b"first\nsecond\rthird\r\nfourth\nlast"[..]);
    let expected: Vec<&[u8]> = vec![
        &b"first\n"[..],
        &b"second\r"[..],
        &b"third\r\n"[..],
        &b"fourth\n"[..],
        &b"last"[..],
    ];
    assert_eq!(mixed_endings, expected);

    let two_blank_lines = DiffableStr::tokenize_lines(&b"\n\n"[..]);
    assert_eq!(two_blank_lines, vec![&b"\n"[..], &b"\n"[..]]);

    let single_newline = DiffableStr::tokenize_lines(&b"\n"[..]);
    assert_eq!(single_newline, vec![&b"\n"[..]]);

    // Empty input produces no lines at all.
    let nothing = DiffableStr::tokenize_lines(&b""[..]);
    assert!(nothing.is_empty());
}
/// Word tokenization of byte slices alternates word and whitespace runs.
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    let words = DiffableStr::tokenize_words("foo bar baz\n\n aha".as_bytes());
    let expected: Vec<&[u8]> = vec![
        "foo".as_bytes(),
        " ".as_bytes(),
        "bar".as_bytes(),
        " ".as_bytes(),
        "baz".as_bytes(),
        "\n\n ".as_bytes(),
        "aha".as_bytes(),
    ];
    assert_eq!(words, expected);
}
/// Char tokenization of byte slices splits a snowflake emoji from its
/// variation selector.
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    // "abcf", U+00F6 (o with diaeresis), U+2744 (snowflake), U+FE0F (variation selector-16).
    let chars = DiffableStr::tokenize_chars("abcf\u{f6}\u{2744}\u{fe0f}".as_bytes());
    let expected: Vec<&[u8]> = vec![
        "a".as_bytes(),
        "b".as_bytes(),
        "c".as_bytes(),
        "f".as_bytes(),
        "\u{f6}".as_bytes(),
        "\u{2744}".as_bytes(),
        "\u{fe0f}".as_bytes(),
    ];
    assert_eq!(chars, expected);
}
/// Grapheme tokenization of byte slices keeps the snowflake and its variation
/// selector together.
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    // "abcf", U+00F6 (o with diaeresis), U+2744 (snowflake), U+FE0F (variation selector-16).
    let graphemes = DiffableStr::tokenize_graphemes("abcf\u{f6}\u{2744}\u{fe0f}".as_bytes());
    let expected: Vec<&[u8]> = vec![
        "a".as_bytes(),
        "b".as_bytes(),
        "c".as_bytes(),
        "f".as_bytes(),
        "\u{f6}".as_bytes(),
        "\u{2744}\u{fe0f}".as_bytes(),
    ];
    assert_eq!(graphemes, expected);
}