2

I need to detect in C++ all possible C/C++ string constants:

std::string s = "dummy text"; // comment
std::string s = "dummier text about \"nothing\""; // don't worry
std::string multiLineString = "dummy multiline \
"another line";
std::string s1="aaa", s2="bbb";
std::string multiString="aaa" "bbb";
std::string division="a/b=c";

but also:

char c = '"';
char c = '\t';
char c = '\'';
char c = '\\';

from code above I want to extract:

"dummy text"
"dummier text about \"nothing\""
"dummy multiline \
"aaa"
"a/b=c"
'"'
'\t'
'\''
'\\'

NOTE: I process the text line by line, so I need just the first string from each line, e.g. only: "dummy multiline \

At first I tried on my own, then I found Alan's solution very useful: Finding quoted strings with escaped quotes in C# using a regular expression
Finally I managed to create program like that:

#include <iostream>
#include <string>
#include <boost/regex.hpp>

// Compiled search pattern: assigned once by initRegex(), read by checkIfFound().
boost::regex regex2quotes;

// Assembles the pattern "(double-quoted text)|(single-quoted text)" from its
// pieces, prints it for inspection, and stores it in the global regex2quotes.
void initRegex()
{
    // Delimiters, each written as a one-character class.
    const std::string doubleQuote = "[\\\"]";
    const std::string singleQuote = "[']";

    // Bodies: a negative lookahead for a backslash followed by any character,
    // i.e. "one character that is not a backslash". The double-quoted body
    // repeats it lazily any number of times, the single-quoted body once or twice.
    const std::string doubleQuotedBody = "((?!\\\\).)*?";
    const std::string singleQuotedBody = "((?!\\\\).){1,2}?";

    std::string finalExpression;
    finalExpression += '(' + doubleQuote + doubleQuotedBody + doubleQuote + ')';
    finalExpression += '|';
    finalExpression += '(' + singleQuote + singleQuotedBody + singleQuote + ')';

    std::cout << "Regex>>>>" << finalExpression << "<<<<<\n\n";
    regex2quotes = finalExpression;
}

void checkIfFound(std::string text)
{
    std::cout << "text>>>>>" << text << "<<<\n";

    boost::smatch result;

    bool found = boost::regex_search(text, result, regex2quotes);
    if(found)
        std::cout << "Found====" << result[0] << "====\n";
    else
        std::cout << "!!!Text not found in: " << text << std::endl; 
}

// Builds the pattern once, then feeds every sample line to checkIfFound()
// in a fixed order (string-literal samples first, character-literal samples last).
int main(int argc, char *argv[])
{
    initRegex();

    const char* const samples[] = {
        "std::string s = \"dummy text\"; // comment",
        "std::string s = \"dummier text about \\\"nothing\\\"\"; // don't worry",
        // Backslash-newline splices the next source line into this literal,
        // so its leading spaces are part of the sample text.
        "std::string multiLineString = \"dummy \\\n\
                \"another line\";",
        "std::string s1=\"aaa\", s2=\"bbb\";",
        "std::string multiString=\"aaa\" \"bbb\";",
        "std::string division=\"a/b=c\";",
        "\"text\";",

        "char c = '\"';",
        // Note: this sample holds a real newline character between the
        // quotes, not the two-character escape sequence.
        "char c = '\n';",
        "char c = '\\\'';",
        "char c = '\\\\';"
    };

    for (const char* sample : samples)
        checkIfFound(sample);

    return 0;
}

Unfortunately, it doesn't extract all the test cases I need. Output:

Regex>>>>([\"]((?!\\).)*?[\"])|([']((?!\\).){1,2}?['])<<<<<

text>>>>>std::string s = "dummy text"; // comment<<<
Found===="dummy text"====
text>>>>>std::string s = "dummier text about \"nothing\""; // don't worry<<<
Found====""====
text>>>>>std::string multiLineString = "dummy \
                "another line";<<<
Found===="another line"====
text>>>>>std::string s1="aaa", s2="bbb";<<<
Found===="aaa"====
text>>>>>std::string multiString="aaa" "bbb";<<<
Found===="aaa"====
text>>>>>std::string division="a/b=c";<<<
Found===="a/b=c"====
text>>>>>"text";<<<
Found===="text"====
text>>>>>char c = '"';<<<
Found===='"'====
text>>>>>char c = ' ';<<<
Found===='  '====
text>>>>>char c = '\'';<<<
!!!Text not found in: char c = '\'';
text>>>>>char c = '\\';<<<
!!!Text not found in: char c = '\\';

Could you please give me some advice? Is it possible to detect them with a regex?

Community
  • 1
  • 1
baziorek
  • 2,502
  • 2
  • 29
  • 43
  • 3
    Parsing C++ with a regex is not going to be fun... – Oliver Charlesworth Jul 25 '15 at 09:06
  • One of your testcases is technically wrong. It should be, when encoded as a string itself: "\"dummy multiline \\\n\"" – autistic Jul 25 '15 at 09:09
  • 1
    What about something [like this](http://goo.gl/uMYzBC) – Jonny 5 Jul 25 '15 at 10:10
  • 1
    @Jonny5 that variable-length lookbehind won't work in C++, it's a .NET feature. – Lucas Trzesniewski Jul 25 '15 at 11:31
  • 1
    You forgot C++11's raw strings `R"foo(hello"world)foo"`. Generally speaking, while it *should* be possible to do this with regex, you'd be better off with a C++ lexer library. – Lucas Trzesniewski Jul 25 '15 at 11:34
  • 2
    I would recommend LibTooling from Clang for that. – Jarod42 Jul 25 '15 at 11:39
  • 1
    Also, it would be very good to know the X in this [XY problem](http://meta.stackexchange.com/questions/66377/what-is-the-xy-problem). I'm having trouble finding an application where this would be really useful – sehe Jul 25 '15 at 22:45
  • @sehe To be honest, in this situation I don't have an absolute X. I've tried to find a solution, but I couldn't. The problem is useful in my master's thesis; the program has to detect many things and calculate statistics :) – baziorek Jul 30 '15 at 07:27
  • In that case, if "many things" are at all programming language related, you must use an existing parser/lexer. Libclang gives you a lot of power on top of that (which classes are derived, which declare a user defined destructor, how many functions have more than the loops etc.). Don't be an amateur. Use proper tools! – sehe Jul 30 '15 at 07:41

1 Answers1

1

I've got a working regex for you. The regex is:

(\/\*.*?\*\/)|(\/\/.*$)|"((?:[^"\n\r]|\\.)*)\\$|"((?:\\.|[^"\n\r])*)"|(?:'(\\?.)')

See it at work here

The problem is... I don't know Boost well enough to implement it... :S What needs to be done is to do a match and then see if capture group 1 or 2 is matching. If so, it's a comment match - ignore it.

If another capture group is matching (3, 4 or 5) it's a "string" constant. (3 is a string ending with a \, 4 is an "ordinary" string and 5 is a character.) Then repeat until no match is found.

An improvement from your attempt is that it handles comments - /* ... */ - as well.

I'm not sure how you want to handle the multi-line case though. The

std::string multiLineString = "dummy multiline \
"another line";

isn't legal C++ code. If it was

std::string multiLineString = "dummy multiline \
another line";

it would be though. But then you can't process the lines individually. You'd have to run the whole code as one single block. But I'm sure you'll figure it out.

Hope this helps.

Regards

EDIT:

Couldn't quite let this one go ;) Here's the code:

#include "stdafx.h"
#include <iostream>
#include <string>
#include <boost/regex.hpp>

using namespace std;
using namespace boost;

// Pattern alternatives, in order, with their capture-group numbers:
//   [1] a /* ... */ block comment
//   [2] a // line comment, up to the end of the line
//   [3] the body of a string literal that is cut off by a backslash at end of line
//   [4] the body of a complete "..." string literal (backslash escapes allowed)
//   [5] the body of a '...' character literal: an optional backslash plus one character
// Comments are matched so that quotes inside them are consumed instead of
// being reported as literals; callers ignore groups 1 and 2.
string sRE = "(\\/\\*.*?\\*\\/)|(\\/\\/.*$)|\"((?:[^\"\\n\\r]|\\\\.)*)\\\\$|\"((?:\\\\.|[^\"\\n\\r])*)\"|(?:'(\\\\?.)')";

// Compiled form of sRE, searched by checkIfFound().
regex re(sRE);

// Walks `text` from left to right, repeatedly searching with the global `re`,
// and prints the body of every string or character literal it finds.
// Matches that are comments (capture groups 1 and 2) produce no output.
void checkIfFound(string text)
{
    // Capture groups 3..5 carry literal bodies.
    const int firstLiteralGroup = 3;
    const int lastLiteralGroup  = 5;

    smatch hit;
    string::const_iterator cursor = text.begin();
    string::const_iterator stop   = text.end();

    while (regex_search(cursor, stop, hit, re))
    {
        int group = firstLiteralGroup;
        while (group <= lastLiteralGroup)
        {
            if (hit[group].matched)
                cout << "Static text found >>>" << hit[group].str() << "<<<" << endl;
            ++group;
        }

        // Resume right after the whole match so it is not found again.
        cursor = hit[0].second;
    }
}

// Prints the pattern text, then feeds every sample line to checkIfFound()
// in a fixed order (string-literal samples first, character-literal samples last).
int _tmain(int argc, char* argv[])
{
    cout << "Regex:\r\n" << sRE << "\n\n";

    const char* const samples[] = {
        "std::string s = \"dummy text\"; // comment",
        "std::string s = \"dummier text about \\\"nothing\\\"\"; // don't worry",
        // Backslash-newline splices the next source line into this literal,
        // so its leading spaces are part of the sample text.
        "std::string multiLineString = \"dummy \\\n\
                 \"another line\";",
        "std::string s1=\"aaa\", s2=\"bbb\"; /* \"Not a string\" */",
        "std::string multiString=\"aaa\" \"bbb\";",
        "std::string division=\"a/b=c\";",
        "\"text\";",

        "char c = '\"';",
        // Note: this sample holds a real newline character between the
        // quotes, not the two-character escape sequence.
        "char c = '\n';",
        "char c = '\\\'';",
        "char c = '\\\\';"
    };

    for (const char* sample : samples)
        checkIfFound(sample);

    return 0;
}

Output:

Regex:
(\/\*.*?\*\/)|(\/\/.*$)|"((?:[^"\n\r]|\\.)*)\\$|"((?:\\.|[^"\n\r])*)"|(?:'(\\?.)')

Static text found >>>dummy text<<<
Static text found >>>dummier text about \"nothing\"<<<
Static text found >>>dummy <<<
Static text found >>>another line<<<
Static text found >>>aaa<<<
Static text found >>>bbb<<<
Static text found >>>aaa<<<
Static text found >>>bbb<<<
Static text found >>>a/b=c<<<
Static text found >>>text<<<
Static text found >>>"<<<
Static text found >>>
<<<
Static text found >>>\'<<<
Static text found >>>\\<<<
SamWhan
  • 8,296
  • 1
  • 18
  • 45