-4

I've got this function:

void simple_shell::parse_command(char* cmd, char** cmdTokens) {
  // TODO: tokenize the command string into arguments
  char *token = strtok(cmd, " ");
  int count = 0;

  while (token != NULL) {
      count = count + 1;
      cout << token << endl;
      token = strtok(NULL, " ");
  }

  char *secondToken = strtok(cmd, " ");
  *cmdTokens = new char[count];
  count = 0;

  while (secondToken != NULL) {
      cmdTokens[count] = secondToken;
      cout << secondToken << endl;
      secondToken = strtok(NULL, " ");
      count = count + 1;
  }
}

I am basically taking cmd and splitting it with the space as a delimiter. Then, in the while loop, I am printing each token.

It works fine with the first while loop and the first token. If the string is "hello world", then I get "hello" and "world" on two separate lines, which is exactly what I want.

However, in the second loop, something goes wrong and I only get "hello", and nothing else. What is going on here?

Human Cyborg Relations
  • 1,202
  • 5
  • 27
  • 52
  • 2
    Don't use `strtok()` to split strings in c++. It's error prone and unreliable regarding reentrance. Here are some better approaches: https://stackoverflow.com/questions/236129/most-elegant-way-to-split-a-string . Also stop using `char*` and `new` when using c++. – user0042 Oct 04 '17 at 01:36
  • You are aware, of course, that `strtok()` ***modifies*** its buffer parameter. That's how it works. Read its description. So, on the first pass through the string, the `char` buffer gets utterly destroyed, and nuked by `strtok()`. The shown code naively expects `cmd` to be its original pristine self, on the second pass. Unfortunately, that's not going to happen. Better luck next time. – Sam Varshavchik Oct 04 '17 at 01:40
  • *I am basically taking cmd and splitting it with the space as a delimiter.* -- Funny, [I'm doing the same thing](http://coliru.stacked-crooked.com/a/727f7740e1c5c73f). Compare that code to your attempt with `strtok`. – PaulMcKenzie Oct 04 '17 at 01:46

1 Answers1

2

strtok() operates on a null-terminated string as input, but it modifies the string as it goes. When it finds a delimiter in the string, it injects a nul character where the delimiter is located and returns a pointer to the token that is in front of the injected nul. On the next call with a NULL input pointer, the search starts after the previously injected nul.

So, when you run through your first loop, everything is output correctly, but strtok() has altered the contents of cmd to replace all of the space characters with nul characters. So, when you run your second loop, it can't find any space characters in cmd anymore, and stops looking when the nul character after the first token is reached, even though it is not really the last nul in the string.

So, you have a few choices.

You could duplicate cmd on each loop. You will also have to duplicate each token that you store in the output array, since they will be pointing at temporary memory:

int simple_shell::parse_command(char* cmd, char*** cmdTokens)
{
    *cmdTokens = NULL;

    char *dup = strdup(cmd);
    if (!dup)
        return -1;

    int count = 0;

    char *token = strtok(dup, " ");
    while (token)
    {
        ++count;
        token = strtok(NULL, " ");
    }

    free(dup);

    try {
        *cmdTokens = new char*[count];
    }
    catch (const std::bad_alloc&) {
        return -1;
    }

    if (count > 0)
    {
        dup = strdup(cmd);
        if (!dup)
        {
            delete[] *cmdTokens;
            return -1;
        }

        token = strtok(dup, " ");
        count = 0;

        while (token)
        {
            cmdTokens[count] = strdup(token);
            if (!cmdTokens[count])
            {
                for(int i = 0; i < count; ++i)
                    free(cmdTokens[i]);
                delete[] *cmdTokens;
                return -1;
            }

            ++count;
            token = strtok(NULL, " ");
        }
    }

    return count;
}

char** tokens;
int numTokens = shell.parse_command("hello world", &tokens);
if (numTokens != -1)
{
    for (int i = 0; i < numTokens; ++i)
    {
        std::cout << tokens[i] << std::endl;
        free(tokens[i]);
    }
    delete[] tokens;
}

Or, you could stop using strtok() and use a less destructive way to extract substrings. You will still have to duplicate the individual tokens, though, since there won't be any nul characters injected into cmd anymore:

const char* nextToken(const char *str)
{
    if (!str)
        return NULL;

    while (*str == ' ')
        ++str;

    if (*str == '\0')
        return NULL;

    return str;
}

const char* endOfToken(const char *str)
{
    if (!str)
        return NULL;

    while ((*str != ' ') && (*str != '\0'))
        ++str;

    return ptr;
}

int simple_shell::parse_command(char* cmd, char*** cmdTokens)
{
    *cmdTokens = NULL;

    char *ptr = cmd;
    int count = 0;

    const char *token = nextToken(ptr);
    while (token)
    {
        ++count;
        token = nextToken(endOfToken(token));
    }

    try {
        *cmdTokens = new char*[count];
    }
    catch (const std::bad_alloc&) {
        return -1;
    }

    if (count > 0)
    {
        ptr = cmd;
        count = 0;

        token = nextToken(ptr);
        while (token)
        {
            const char *end = endOfToken(token);
            int len = (end-token);

            try {
                cmdTokens[count] = new char[len+1];
            }
            catch (const std::bad_alloc&) {
                for(int i = 0; i < count; ++i)
                    delete[] cmdTokens[i];
                return -1;
            }

            memcpy(cmdTokens[count], token, len);
            cmdTokens[count][len] = '\0';
            ++count;

            token = nextToken(end);
        }
    }

    return count;
}

char** tokens;
int numTokens = shell.parse_command("hello world", &tokens);
if (numTokens != -1)
{
    for (int i = 0; i < numTokens; ++i)
    {
        std::cout << tokens[i] << std::endl;
        delete[] tokens[i];
    }
    delete[] tokens;
}

Or, if you don't like duplicating the tokens, just return pointers to them, as well as their lengths:

struct token_info
{
    char* token;
    int length;
}

const char* nextToken(const char *str)
{
    if (!str)
        return NULL;

    while (*str == ' ')
        ++str;

    if (*str == '\0')
        return NULL;

    return str;
}

const char* endOfToken(const char *str)
{
    if (!str)
        return NULL;

    while ((*str != ' ') && (*str != '\0'))
        ++str;

    return ptr;
}

int simple_shell::parse_command(char* cmd, token_info** cmdTokens)
{
    *cmdTokens = NULL;

    char *ptr = cmd;
    int count = 0;

    const char *token = nextToken(ptr);
    while (token)
    {
        ++count;
        token = nextToken(endOfToken(token));
    }

    try {
        *cmdTokens = new token_info[count];
    }
    catch (const std:bad_alloc&) {
        return -1;
    }

    if (count > 0)
    {
        ptr = cmd;
        count = 0;

        token = nextToken(ptr);
        while (token)
        {
            const char *end = endOfToken(token);

            cmdTokens[count].token = const_cast<char*>(token);
            cmdTokens[count].length = (end-token);
            ++count;

            token = nextToken(end);
        }
    }

    return count;
}

token_info* tokens;
int numTokens = shell.parse_command("hello world", &tokens);
if (numTokens != -1)
{
    for (int i = 0; i < numTokens; ++i)
        std::cout << std::string(tokens[i].token, tokens[i].length) << std::endl;
    delete[] tokens;
}

But, since you are clearly using C++, you should stop using such archaic C-style methods, and use more modern C++ practices instead, eg:

std::vector<std::string> simple_shell::parse_command(const std::string &cmd)
{
    std::istringstream iss(cmd);
    return std::vector<std::string>(
        std::istream_iterator<std::string>(iss),
        std::istream_iterator<std::string>()
    );
}

std::vector<std::string> tokens = shell.parse_command("hello world");
for (auto &token: tokens)
    std::cout << token << std::endl;
Remy Lebeau
  • 555,201
  • 31
  • 458
  • 770