1

The snippet below comes from this answer.

#include <string>
#include <vector>

// Splits `str` into tokens separated by any character in `delimiters` and
// appends them to `token_v`.  Runs of consecutive delimiters are collapsed;
// leading/trailing delimiters produce no empty tokens.
//
// NOTE(review): the original relied on an undefined DELIMITER macro, wrote
// `std::vector<string>` without the std:: qualifier, and copied the input
// string.  The delimiter set is now an explicit parameter (defaulting to a
// single space — confirm this matches the old DELIMITER) and the input is
// taken by const reference.
void tokenize(const std::string &str, std::vector<std::string> &token_v,
              const std::string &delimiters = " ")
{
    size_t start = str.find_first_not_of(delimiters);

    while (start != std::string::npos)
    {
        // Find the next occurrence of any delimiter (npos at end of input).
        const size_t end = str.find_first_of(delimiters, start);
        // Push back the token found into the vector.
        token_v.push_back(str.substr(start, end - start));
        // Skip all occurrences of the delimiters to find the new start.
        start = str.find_first_not_of(delimiters, end);
    }
}

Now for a buffer like this:

std::array<char, 150> buffer;

I want to have a string_view (that points to the buffer) and pass it to the tokenizer function, and the tokens should be returned in the form of std::string_views via an out parameter (and not a vector); it should also return the number of tokens that were extracted. The interface looks like this:

size_t tokenize( const std::string_view inputStr,
                 const std::span< std::string_view > foundTokens_OUT,
                 const size_t expectedTokenCount )
{
    // implementation
}

int main( )
{
    // The aggregate initializer is shorter than 150 characters, so the rest
    // of the buffer is zero-filled with '\0'.
    std::array<char, 150> buffer { " @a hgs -- " };
    // NOTE(review): the view deliberately spans the whole 150-byte buffer,
    // embedded '\0' fill bytes included — the tokenizer must cope with them.
    const std::string_view sv { buffer.data( ), buffer.size( ) };
    const size_t expectedTokenCount { 4 };

    std::array< std::string_view, expectedTokenCount > foundTokens; // the span for storing found tokens

    const size_t num_of_found_tokens { tokenize( sv, foundTokens, expectedTokenCount ) };

    if ( num_of_found_tokens == expectedTokenCount )
    {
        // do something
        std::clog << "success\n" << num_of_found_tokens << '\n';
    }

    // Print only the tokens actually written; if the tokenizer returned
    // expectedTokenCount + 1 this still reads one-past the stored tokens —
    // NOTE(review): guard against that case before indexing in real code.
    for ( size_t idx { }; idx < num_of_found_tokens; ++idx )
    {
        std::cout << std::quoted( foundTokens[ idx ] ) << '\n';
    }
}

I would appreciate it if someone could implement a similar tokenize function but for string_view that splits based on space and tab characters. I tried to write one myself but it didn't work as expected (didn't support the tab). Also, I want this function to stop the work and return expectedTokenCount + 1 if the number of tokens found in inputStr exceeds the expectedTokenCount. This is obviously more efficient.

Here is my dummy version:

// Splits `inputStr` into tokens separated by spaces OR tabs (the original
// only recognized ' ', which is the bug the question describes), storing up
// to `expectedTokenCount` views into `foundTokens_OUT`.
//
// Returns the number of tokens found; if the input holds more tokens than
// `expectedTokenCount`, stops scanning immediately and returns
// `expectedTokenCount + 1` (the surplus token is not stored).
//
// Precondition: foundTokens_OUT.size() >= expectedTokenCount.
size_t tokenize( const std::string_view inputStr,
                 const std::span< std::string_view > foundTokens_OUT,
                 const size_t expectedTokenCount )
{
    // Split on both space and tab via the character-set overloads.
    constexpr std::string_view delimiters { " \t" };

    if ( inputStr.empty( ) )
    {
        return 0;
    }

    size_t start { inputStr.find_first_not_of( delimiters ) };
    size_t foundTokensCount { };

    while ( start != std::string_view::npos )
    {
        if ( foundTokensCount == expectedTokenCount )
        {
            // A token beyond the expected count exists — signal overflow.
            return expectedTokenCount + 1;
        }

        // End of the current token: the next delimiter, or npos at the end
        // of the input (substr clamps npos to the end of the view).
        const size_t end { inputStr.find_first_of( delimiters, start ) };
        foundTokens_OUT[ foundTokensCount++ ] = inputStr.substr( start, end - start );
        // Skip the whole delimiter run to locate the next token's start.
        start = inputStr.find_first_not_of( delimiters, end );
    }

    return foundTokensCount;
}

Note: The ranges library does not have proper support yet (at least on GCC) so I'm trying to avoid that.

digito_evo
  • 3,216
  • 2
  • 14
  • 42
  • @Some programmer dude Yes but I guess you didn't read my intentions. I do not want to return a vector at all. That's why I am placing the `string_view`s (i.e. tokens) in a `std::array` (`foundTokens`) that lives on the stack of `main`. – digito_evo Mar 13 '22 at 17:03
  • 1
    "*I tried to write one myself but it didn't work as expected (didn't support the tab).*" You should at least show what you tried. – 康桓瑋 Mar 13 '22 at 17:06
  • @康桓瑋 I put it in the answer. Have a look, please. – digito_evo Mar 13 '22 at 17:10
  • So your only issue with the code you have is that it doesn't support spaces AND tabs? And you want someone to write that code for you... ? – super Mar 13 '22 at 17:21
  • @super Yes, I want a proper implementation or at least some guidance on how to do it. – digito_evo Mar 13 '22 at 17:26
  • Did you know that `find_first_not_of` can accept *character sequence* as the first parameter like `inputStr.find_first_not_of(" \t")`? – 康桓瑋 Mar 13 '22 at 17:28
  • @康桓瑋 Oh ok. I'll try that. – digito_evo Mar 13 '22 at 17:32

2 Answers2

2

I tried to write one myself but it didn't work as expected (didn't support the tab).

If you want to support splitting with spaces and tabs, then you can use another overload of find_first_not_of:

size_type find_first_not_of(const CharT* s, size_type pos = 0) const;

which finds the first character equal to none of the characters in the string pointed to by s.

So your implementation only needs to change find_first_not_of(' ') and find(' ') to find_first_not_of(" \t") and find_first_of(" \t").

Demo

康桓瑋
  • 33,481
  • 5
  • 40
  • 90
0

This is my implementation (which I wrote earlier), which can handle things like inputs that start with one or more delimiters, contain repeated delimiters, or end with one or more delimiters:

It uses string_views for everything, so no memory allocation, but be careful you don't throw away the input strings too early. string_views are after all non-owning.

online demo : https://onlinegdb.com/tytGlOVnk

#include <vector>
#include <string_view>
#include <iostream>

// Splits `string` into the substrings separated by any character found in
// `delimiters`.  Leading, trailing and repeated delimiters yield no empty
// tokens.  The returned views alias `string`'s characters, so the caller
// must keep the underlying buffer alive while the result is in use.
auto tokenize(std::string_view string, std::string_view delimiters)
{
    std::vector<std::string_view> substrings;

    // With no delimiters the whole input is one token (original behavior,
    // including for an empty input).
    if (delimiters.empty())
    {
        substrings.emplace_back(string);
        return substrings;
    }

    const auto max_length = string.length();
    auto start_pos = string.find_first_not_of(delimiters);

    // npos compares greater than max_length, so the condition also ends the
    // loop when no further token start exists.
    while (start_pos < max_length)
    {
        // `start_pos` sits on a non-delimiter, so the next delimiter (or the
        // end of the string) is strictly past it — the loop always advances.
        // NOTE(review): the original guarded this with `end_pos != start_pos`,
        // a condition that can never be false but would have looped forever
        // if it ever were; the guard is removed.
        const auto end_pos = std::min(max_length, string.find_first_of(delimiters, start_pos));
        substrings.emplace_back(&string[start_pos], end_pos - start_pos);
        start_pos = string.find_first_not_of(delimiters, end_pos);
    }

    return substrings;
}

int main()
{
    // Input sentence; the views produced by tokenize() point into it.
    constexpr std::string_view sentence{ "The, quick! and brown fox. Jumped : over the lazy dog, or did he?" };

    // Split on whitespace and common punctuation.
    const auto tokens = tokenize(sentence, " ,!.?:");

    // Print one token per line.
    for (std::size_t i = 0; i < tokens.size(); ++i)
    {
        std::cout << tokens[i] << "\n";
    }

    return 0;
}
Pepijn Kramer
  • 9,356
  • 2
  • 8
  • 19