1

I'm attempting to extract both a list of items and a set of key-value pairs from a single string. For example, matching on the string: "\"a,b,c\",,\"$a = test1, $1 = test2, $2 = test2\",3\n"

should return

a test1
1 test2
2 test2
a
b
c

where

a test1
1 test2
2 test2

is a Map and a, b, c is a List of items.

The code below:

import javafx.util.Pair;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Pulls a list of items and a map of {@code $key = value} pairs out of a single
 * CSV-like line using regular expressions.
 */
public class KeyValuesExtract {

    /**
     * Splits the first double-quoted segment of {@code line} on commas to form the
     * item list, then reads the capture groups of every quoted
     * {@code $key = value, ...} segment into a map.
     *
     * <p>Only the four capture groups of each match are read. The second pair of
     * groups sits inside a repeated sub-pattern, and a repeated capturing group
     * reports only its last iteration, so intermediate pairs are not seen here.
     *
     * @param line the text to scan
     * @return a pair whose key is the item list and whose value is the key/value map
     * @throws IllegalStateException if {@code line} contains no double-quoted segment
     */
    private Pair<List<String>, Map<String, String>> getKeyValues(final String line) {

        final Matcher firstQuoted = Pattern.compile("\"(.*?)\"").matcher(line);
        // The result of find() is not checked; group(0) below throws if nothing matched.
        firstQuoted.find();

        final List<String> items = Arrays.stream(firstQuoted.group(0).split(","))
                .map(token -> token.replace("\"", "").trim())
                .collect(Collectors.toList());

        final Map<String, String> pairs = new HashMap<>();

        final Matcher pairMatcher = Pattern
                .compile("\"\\$([A-Za-z0-9]+)\\s=\\s(\\w+)(?:,\\s\\$([A-Za-z0-9]+)\\s=\\s(\\w+))*\"")
                .matcher(line);
        while (pairMatcher.find()) {
            final int groups = pairMatcher.groupCount();
            // Groups come in (key, value) order: 1/2 for the first pair, 3/4 for the repeated one.
            for (int g = 1; g <= groups; g += 2) {
                pairs.put(pairMatcher.group(g), pairMatcher.group(g + 1));
            }
        }

        return new Pair<>(items, pairs);
    }

    /**
     * Runs the extraction on a fixed sample line and prints the map entries
     * followed by the list items, one per line.
     */
    public static void main(String args[]) {

        final String str = "\"a,b,c\",,\"$a = test1, $1 = test2, $2 = test2\",3\n";

        final Pair<List<String>, Map<String, String>> extracted =
                new KeyValuesExtract().getKeyValues(str);

        for (final Map.Entry<String, String> entry : extracted.getValue().entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }

        for (final String item : extracted.getKey()) {
            System.out.println(item);
        }

    }

}

prints :

a test1
2 test2
a
b
c

Following an earlier question, "Extracting key value pair from substring within string", I've updated the regex from

"\"\\$(\\d+)\\s=\\s(\\w+)(?:,\\s\\$(\\d+)\\s=\\s(\\w+))*\""

so that the keys can contain both digits and letters:

"\"\\$([A-Za-z0-9]+)\\s=\\s(\\w+)(?:,\\s\\$([A-Za-z0-9]+)\\s=\\s(\\w+))*\""

How can I match all of the Map entries?

a test1
1 test2
2 test2
thepen
  • 371
  • 1
  • 11

2 Answers

2

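You cannot retrieve every repetition of a repeated capturing group from a single match: when a group sits inside a quantifier such as `(...)*`, the matcher only retains the text of its last iteration (see Java regex: Repeating capturing groups).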

Really, once your input becomes more complex, you should use a proper lexer and parser.

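Anyway, your problem can be solved by scanning the string in two passes: one pattern finds each whole quoted segment, and a second pattern extracts the individual tokens from within it: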

// Holder for the two collections produced by parsing a line:
// the extracted items and the extracted key/value pairs.
// NOTE(review): the annotations look like Lombok's (imports are not shown in this
// snippet) — presumably they generate the accessors, the two-argument constructor
// and the toString() used by the surrounding code; confirm against the build setup.
@Getter
@Setter
@AllArgsConstructor
@ToString
static class Result {
    // Tokens taken from quoted, comma-separated item segments.
    private List<String> items;
    // Key -> value entries taken from quoted "$key = value" segments.
    private Map<String, String> map;
}

/**
 * Scans {@code str} in two passes and gathers everything it finds into one {@code Result}.
 *
 * <p>Each pass first locates whole double-quoted segments of the expected shape and
 * then walks the individual tokens inside each segment, which avoids relying on
 * repeated capturing groups.
 *
 * @param str the line to scan
 * @return a result holding all item tokens and all key/value pairs found in {@code str}
 */
static Result parse(String str) {
    // Quoted segment consisting solely of comma-separated alphanumeric tokens.
    final Pattern itemSegment = Pattern.compile("(\" *\\p{Alnum}+ *(?:, *\\p{Alnum}+ *)*\")");
    // A single alphanumeric token inside such a segment.
    final Pattern itemToken = Pattern.compile("\\p{Alnum}+");
    // Quoted segment consisting solely of comma-separated "$key = value" pairs.
    final Pattern pairSegment = Pattern.compile("(\" *\\$\\p{Alnum}+ *= *\\p{Alnum}+ *(?:, *\\$\\p{Alnum}+ *= *\\p{Alnum}+ *)*\")");
    // A single pair inside such a segment: group 1 is the key, group 2 the value.
    final Pattern pairToken = Pattern.compile("\\$(\\p{Alnum}+) *= *(\\p{Alnum}+)");

    final List<String> items = new ArrayList<>();
    for (Matcher segment = itemSegment.matcher(str); segment.find(); ) {
        for (Matcher token = itemToken.matcher(segment.group()); token.find(); ) {
            items.add(token.group());
        }
    }

    final Map<String, String> pairs = new HashMap<>();
    for (Matcher segment = pairSegment.matcher(str); segment.find(); ) {
        for (Matcher token = pairToken.matcher(segment.group()); token.find(); ) {
            pairs.put(token.group(1), token.group(2));
        }
    }

    return new Result(items, pairs);
}

/** Parses two fixed sample lines and prints each parsed result on its own line. */
public static void main(String... args) {

    // the example line from the question
    final String sample = "\"a,b,c\",,\"$a = test1, $1 = test2, $2 = test2\",3\n";

    // a longer line with an additional quoted item and an unspaced, padded pair
    final String extended = "\"a,b,c\",,\"$a = test1, $1 = test2, $2 = test2\",3,\"foo\",bar,\" $er33=33re  \"\n";

    System.out.println(parse(sample));
    System.out.println(parse(extended));

}

with output

Result(items=[a, b, c], map={a=test1, 1=test2, 2=test2})
Result(items=[a, b, c, foo], map={a=test1, 1=test2, 2=test2, er33=33re})

or, if you need separate lists and maps you could do

private List<List<String>> items;
private List<Map<String, String>> map;
...
// First pass: every matched item segment contributes its own inner list,
// so segments stay separate instead of being merged into one flat list.
final Matcher matcher1 = find1.matcher(str);
while (matcher1.find()) {
    // Walk the individual tokens inside the segment that was just matched.
    final Matcher extractor1 = extract1.matcher(matcher1.group(0));
    final List<String> l = new ArrayList<>();
    while(extractor1.find())
        l.add(extractor1.group(0));
    result.items.add(l);
}

// Second pass: every matched key/value segment contributes its own map.
final Matcher matcher2 = find2.matcher(str);
while (matcher2.find()) {
    // Walk the individual pairs inside the segment; group 1 is stored as key, group 2 as value.
    final Matcher extractor2 = extract2.matcher(matcher2.group(0));
    final Map<String, String> m = new HashMap<>();
    while(extractor2.find())
        m.put(extractor2.group(1), extractor2.group(2));
    result.map.add(m);
}

with output

Result(items=[[a, b, c]], map=[{a=test1, 1=test2, 2=test2}])
Result(items=[[a, b, c], [foo]], map=[{a=test1, 1=test2, 2=test2}, {er33=33re}])

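Moreover, if you need to preserve the order in which the segments appear, you can do that too: search with a single combined regular expression of the form (find1|find2), and then apply extract1 or extract2 to each match depending on which alternative it matched.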

josejuan
  • 9,338
  • 24
  • 31
  • Thanks for this. If a value contains an '_', for example 'test2_5': System.out.println(parse("\"a,b,c\",,\"$a = test1, $1 = test2, $2 = test2_5\",3,\"foo\",bar,\" $er33=33re \"\n")); then the pattern does not match. I could replace every _ prior to parsing, but can the regex be updated to handle this? – thepen Aug 08 '21 at 11:32
  • 1
    Of course, you can replace `\\p{Alnum}` with any expression you would like to match, e.g. `[_\\p{Alnum}]`. – josejuan Aug 08 '21 at 14:06
0

You can use this regex :

\$[a-zA-Z0-9]+\s*=\s*[a-zA-Z0-9]+[,]*

I tried with your example in https://regex101.com/r/xDX5v7/1 :

"\"a,b,c\",,\"$a = test1, $1 = test2, $2 = test2\",3\n"

and it returned :

$a = test1,
$1 = test2,
$2 = test2

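Once you have compiled the pattern, call Matcher.find() in a loop to visit each match in turn (note that groupCount() returns the number of capturing groups in the pattern, not the number of matches, so it cannot be used to count them). For each matched string, strip the leading $ and any trailing comma, split on the = sign, trim the two parts, and put them in a map as key and value.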

DebashisDeb
  • 392
  • 5
  • 13