21

Is there a trivial way to split a string keeping the separators? Instead of this:

// Split on every character that is neither alphanumeric nor an apostrophe,
// then drop the empty pieces produced between adjacent separators.
let texte = "Ten. Million. Questions. Let's celebrate all we've done together.";
let v: Vec<&str> = texte.split(|c: char| !(c.is_alphanumeric() || c == '\'')).filter(|s| !s.is_empty()).collect();

which results in ["Ten", "Million", "Questions", "Let's", "celebrate", "all", "we've", "done", "together"].

I would like something that gives me :

["Ten", ".", " ", "Million", ".", " ", "Questions", ".", " ", "Let's", " ", "celebrate", " ", "all", " ", "we've", " ", "done", " ", "together", "."].

I am trying this kind of code (it assumes the string begins with a letter and ends with a non-letter):

let texte = "Ten. Million. Questions. Let's celebrate all we've done together.  ";
// v1: the words, i.e. the non-empty runs of alphanumeric characters and apostrophes.
let v1: Vec<&str> = texte.split(|c: char| !(c.is_alphanumeric() || c == '\'')).filter(|s| !s.is_empty()).collect();
// v2: the separator runs, obtained by splitting on the inverse predicate.
let v2: Vec<&str> = texte.split(|c: char| c.is_alphanumeric() || c == '\'').filter(|s| !s.is_empty()).collect();
let mut w: Vec<&str> = Vec::new();

// Interleave one word, then one separator run, driven by the separators in v2.
// `v1[j]` panics if v2 has more entries than v1, and a final word that is not
// followed by a separator is never pushed.
let mut j = 0;
for i in v2 {
    w.push(v1[j]);
    w.push(i);
    j = j+1;
}

It gives me almost the result I described earlier, which is good enough:

["Ten", ". ", "Million", ". ", "Questions", ". ", "Let's", " ", "celebrate", " ", "all", " ", "we've", " ", "done", " ", "together", "."]

However, is there a better way to write this? I tried to use `enumerate` on v2 but it didn't work, and using `j` in the for loop looks rough.

Pierre
  • 1,182
  • 11
  • 15

3 Answers3

12

Using str::match_indices:

let text = "Ten. Million. Questions. Let's celebrate all we've done together.";

// A separator is any character that is neither alphanumeric nor an apostrophe.
let is_separator = |c: char| !c.is_alphanumeric() && c != '\'';

let mut result: Vec<&str> = Vec::new();
// Byte offset just past the most recently emitted separator.
let mut last = 0;
for (index, matched) in text.match_indices(is_separator) {
    // Emit the word sitting between the previous separator and this one, if any.
    let word = &text[last..index];
    if !word.is_empty() {
        result.push(word);
    }
    result.push(matched);
    last = index + matched.len();
}

// Whatever follows the final separator is a trailing word.
let tail = &text[last..];
if !tail.is_empty() {
    result.push(tail);
}

println!("{:?}", result);

Prints:

["Ten", ".", " ", "Million", ".", " ", "Questions", ".", " ", "Let\'s", " ", "celebrate", " ", "all", " ", "we\'ve", " ", "done", " ", "together", "."]
Shepmaster
  • 388,571
  • 95
  • 1,107
  • 1,366
robinst
  • 30,027
  • 10
  • 102
  • 108
  • I was playing with this, and realized that renaming `matched` to `separator`, made me understand it even better, since what `match_indices()` is looking for is the separators in the question. – Christian Davén Jan 26 '21 at 05:23
9

str::split_inclusive, available since Rust 1.51, returns an iterator keeping the delimiters as part of the matched strings, and may be useful in certain cases:

/// `split_inclusive` leaves each delimiter attached to the end of the piece it terminates.
#[test]
fn split_with_delimiter() {
    let is_delimiter = |c: char| matches!(c, ',' | ';');
    let items: Vec<&str> = "alpha,beta;gamma".split_inclusive(is_delimiter).collect();
    assert_eq!(items, ["alpha,", "beta;", "gamma"]);
}

/// Back-to-back delimiters each come out as their own piece.
#[test]
fn split_with_delimiter_allows_consecutive_delimiters() {
    let is_delimiter = |c: char| matches!(c, ',' | ';');
    let items: Vec<&str> = ",;".split_inclusive(is_delimiter).collect();
    assert_eq!(items, [",", ";"]);
}
Shepmaster
  • 388,571
  • 95
  • 1,107
  • 1,366
5

I was not able to find anything in the standard library, so I wrote my own:

This version uses the unstable pattern API as it's more flexible, but the link above has a fallback that I've hardcoded for my specific stable use case.

#![feature(pattern)]

use std::str::pattern::{Pattern, Searcher};

/// One piece yielded by [`SplitKeepingDelimiter`]: either text between delimiters or a
/// delimiter itself.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum SplitType<'a> {
    /// A slice of the input lying between (or around) delimiter matches.
    Match(&'a str),
    /// A slice of the input that the pattern matched.
    Delimiter(&'a str),
}

/// Iterator over a string that yields both the text between pattern matches and the matches
/// themselves, each tagged with a [`SplitType`].
pub struct SplitKeepingDelimiter<'p, P>
where
    P: Pattern<'p>,
{
    /// Searcher that locates the pattern matches in the haystack.
    searcher: P::Searcher,
    /// Byte offset in the haystack at which the next yielded piece begins.
    start: usize,
    /// End offset of a delimiter match that has been found but not yet yielded.
    saved: Option<usize>,
}

impl<'p, P> Iterator for SplitKeepingDelimiter<'p, P>
where
    P: Pattern<'p>,
{
    type Item = SplitType<'p>;

    /// Yields the next piece of the haystack, tagging it as text or delimiter, and returns
    /// `None` once the cursor has reached the end of the haystack.
    fn next(&mut self) -> Option<Self::Item> {
        let haystack = self.searcher.haystack();
        let cursor = self.start;

        if cursor == haystack.len() {
            return None;
        }

        // A delimiter located by the previous call is still pending: emit it first.
        if let Some(delimiter_end) = self.saved.take() {
            let delimiter = &haystack[cursor..delimiter_end];
            self.start = delimiter_end;
            return Some(SplitType::Delimiter(delimiter));
        }

        let (piece, resume_at) = match self.searcher.next_match() {
            // No further delimiter: the rest of the haystack is one final piece of text.
            None => (SplitType::Match(&haystack[cursor..]), haystack.len()),
            // The delimiter starts right at the cursor, so there is no text in front of it.
            Some((match_start, match_end)) if match_start == cursor => {
                let delimiter = &haystack[match_start..match_end];
                (SplitType::Delimiter(delimiter), match_end)
            }
            // Text precedes the delimiter: emit the text now, remember the delimiter's end
            // so the following call can emit the delimiter.
            Some((match_start, match_end)) => {
                let text = &haystack[cursor..match_start];
                self.saved = Some(match_end);
                (SplitType::Match(text), match_start)
            }
        };

        self.start = resume_at;
        Some(piece)
    }
}

/// Extension trait providing `split_keeping_delimiter` on anything that can be indexed with
/// `..` to obtain a `str`.
pub trait SplitKeepingDelimiterExt: ::std::ops::Index<::std::ops::RangeFull, Output = str> {
    /// Splits `self` on `pattern`, returning an iterator that yields the delimiters as well
    /// as the text between them.
    fn split_keeping_delimiter<P>(&self, pattern: P) -> SplitKeepingDelimiter<P>
    where
        P: for<'a> Pattern<'a>,
    {
        // View the receiver as a plain string slice; that slice is the haystack searched.
        let haystack: &str = &self[..];
        SplitKeepingDelimiter {
            saved: None,
            start: 0,
            searcher: pattern.into_searcher(haystack),
        }
    }
}

// Opt `str` into the extension trait; the default method body does all the work.
impl SplitKeepingDelimiterExt for str {}

#[cfg(test)]
mod test {
    use super::SplitKeepingDelimiterExt;
    use super::SplitType::{Delimiter, Match};

    /// Delimiter set shared by the tests below.
    const DELIMITERS: &[char] = &[',', ';'];

    /// Text and delimiters come out interleaved, in input order.
    #[test]
    fn split_with_delimiter() {
        let expected = vec![
            Match("alpha"),
            Delimiter(","),
            Match("beta"),
            Delimiter(";"),
            Match("gamma"),
        ];
        let actual: Vec<_> = "alpha,beta;gamma"
            .split_keeping_delimiter(DELIMITERS)
            .collect();
        assert_eq!(actual, expected);
    }

    /// Adjacent delimiters are yielded one by one with no empty text between them.
    #[test]
    fn split_with_delimiter_allows_consecutive_delimiters() {
        let expected = vec![Delimiter(","), Delimiter(";")];
        let actual: Vec<_> = ",;".split_keeping_delimiter(DELIMITERS).collect();
        assert_eq!(actual, expected);
    }
}

You'll note that I needed to track if something was one of the delimiters or not, but that should be easy to adapt if you don't need it.

Shepmaster
  • 388,571
  • 95
  • 1,107
  • 1,366
  • Wow, I need to learn more about Rust to understand that code. However, I later thought about splitting the string twice: once with the pattern to get the words, and then with the opposite pattern. What do you think about my new code? – Pierre Aug 28 '15 at 16:16
  • 1
    This will be simpler when [`str::match_indices`](http://doc.rust-lang.org/nightly/std/primitive.str.html#method.match_indices) goes stable. – bluss Aug 28 '15 at 19:35