2

I am receiving data from the Google Translate service and need help splitting it.

void Start()
{
    // Demo entry point: request a translation of a sample English sentence into French.
    string sampleText = "Hello, This is a test!";
    string sourceLanguage = "en";
    string targetLanguage = "fr";

    translateText(sampleText, sourceLanguage, targetLanguage);
}

// Builds the translate request URL for the given text and language codes,
// then starts the coroutine that downloads and decodes the response.
//   text         - the text to translate (sent as the q= query parameter)
//   fromLanguage - source language code (sl= parameter), e.g. "en"
//   toLanguage   - target language code (tl= parameter), e.g. "fr"
void translateText(string text, string fromLanguage, string toLanguage)
{
    // EscapeDataString is used instead of EscapeUriString because the latter leaves
    // reserved characters such as '&', '#', '+', '?' and '=' untouched, so text
    // containing them would truncate or corrupt the query string.
    string url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=" + Uri.EscapeDataString(fromLanguage)
        + "&tl=" + Uri.EscapeDataString(toLanguage)
        + "&dt=t&q=" + Uri.EscapeDataString(text);
    StartCoroutine(startTranslator(url));
}

// Coroutine: downloads the given URL, logs the raw response, decodes it and logs
// the decoded fields. Logs a warning instead of throwing when nothing usable
// was received.
IEnumerator startTranslator(string url)
{
    // The request object is disposable; 'using' releases it once the response
    // text has been read.
    using (UnityWebRequest www = UnityWebRequest.Get(url))
    {
        yield return www.Send();

        string raw = www.downloadHandler.text;
        Debug.Log("Raw string Received: " + raw);

        // decodeResult returns null when fewer than four values were found, and
        // cannot handle a null string at all, so guard both cases here.
        LanguageResult tempResult = string.IsNullOrEmpty(raw) ? null : decodeResult(raw);
        if (tempResult == null)
        {
            Debug.LogWarning("Translation response could not be decoded.");
            yield break;
        }

        Debug.Log("Original Text: " + tempResult.originalText);
        Debug.Log("Translated Text: " + tempResult.translatedText);
        Debug.Log("LanguageIso: " + tempResult.languageIso);
    }

    yield return null;
}

// Splits the raw response on brackets, double quotes and commas and maps the
// first four non-empty pieces onto a LanguageResult. Returns null when fewer
// than four pieces are found. Commas inside quoted text are split as well.
LanguageResult decodeResult(string result)
{
    string[] pieces = result.Split(
        new[] { '[', '\"', ']', ',' },
        StringSplitOptions.RemoveEmptyEntries);

    if (pieces.Length < 4)
    {
        return null;
    }

    return new LanguageResult
    {
        translatedText = pieces[0],
        originalText = pieces[1],
        unknowValue = pieces[2],
        languageIso = pieces[3]
    };
}

// Plain data holder for the four values extracted from a translation response.
public class LanguageResult
{
    // The translated sentence (first value in the response).
    public string translatedText;
    // The sentence that was submitted for translation (second value).
    public string originalText;
    // Third value of the response; its meaning is not documented here
    // (it is "0" in the sample response).
    public string unknowValue;
    // Language code reported by the service (fourth value, "en" in the sample).
    public string languageIso;
}

I then call it with translateText("Hello, This is a test!", "en", "fr"); from the Start() function, which translates the English sentence to French using ISO 639-1 language codes.

The received data looks like this:

[[["Bonjour, Ceci est un test!","Hello, This is a test!",,,0]],,"en"]

I want to split it like this:

  • Bonjour, Ceci est un test!
  • Hello, This is a test!
  • 0
  • en

and put them into a string array in order.

I currently use this:

// Characters treated as separators: brackets, double quote and comma.
char[] delims = { '[', '\"', ']', ',' };
        // Split on every separator and drop empty pieces. Commas inside the quoted
        // sentences are separators too, which is what breaks the result.
        string[] arr = result.Split(delims, StringSplitOptions.RemoveEmptyEntries);

This works if there is no comma in the received string. If there is a comma, the split values are messed up. What's the best way to split this?

EDIT:

With Blorgbeard's solution, the final working code is as below. Hopefully, this will help somebody else. This shouldn't be used for commercial purposes, only for personal or school projects.

void Start()
{
    // Alternative inputs that exercise quotes, slashes, backslashes and commas:
    //translateText("Hello, This is \" / \\ a test !", "en", "fr");
    //translateText("Hello, This is , \\ \" a test !", "en", "fr");

    // Default demo: translate a sample English sentence into French.
    string sampleText = "Hello, This is a test!";
    string sourceLanguage = "en";
    string targetLanguage = "fr";

    translateText(sampleText, sourceLanguage, targetLanguage);
}

// Builds the translate request URL for the given text and language codes,
// then starts the coroutine that downloads and decodes the response.
//   text         - the text to translate (sent as the q= query parameter)
//   fromLanguage - source language code (sl= parameter), e.g. "en"
//   toLanguage   - target language code (tl= parameter), e.g. "fr"
void translateText(string text, string fromLanguage, string toLanguage)
{
    // Uri.EscapeUriString does not encode reserved characters ('&', '#', '+',
    // '?', '='), so text containing them would break the query string.
    // Uri.EscapeDataString encodes every query-parameter value safely.
    string url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=" + Uri.EscapeDataString(fromLanguage)
        + "&tl=" + Uri.EscapeDataString(toLanguage)
        + "&dt=t&q=" + Uri.EscapeDataString(text);
    StartCoroutine(startTranslator(url));
}

// Coroutine: downloads the given URL, logs the raw response, decodes it and
// hands the decoded result to displayResult. Logs a warning when the response
// is empty or cannot be decoded.
IEnumerator startTranslator(string url)
{
    // The request object is disposable; 'using' releases it once the response
    // text has been read.
    using (UnityWebRequest www = UnityWebRequest.Get(url))
    {
        yield return www.Send();

        string raw = www.downloadHandler.text;
        Debug.Log("Raw string Received: " + raw);

        // decodeResult yields null when fewer than four values were decoded;
        // an empty or missing body is treated the same way.
        LanguageResult tempResult = string.IsNullOrEmpty(raw) ? null : decodeResult(raw);
        if (tempResult != null)
        {
            displayResult(tempResult);
        }
        else
        {
            Debug.LogWarning("Translation response could not be decoded.");
        }
    }

    yield return null;
}

// Logs the original text, translated text and language code of a decoded
// translation. A null result (decoding failed) is reported with a warning
// instead of causing a NullReferenceException.
void displayResult(LanguageResult translationResult)
{
    if (translationResult == null)
    {
        Debug.LogWarning("No translation result to display.");
        return;
    }

    Debug.Log("Original Text: " + translationResult.originalText);
    Debug.Log("Translated Text: " + translationResult.translatedText);
    Debug.Log("LanguageIso: " + translationResult.languageIso);
}

// Tokenizes the raw response with Decode and maps the first four values onto a
// LanguageResult. Returns null when fewer than four values were decoded.
LanguageResult decodeResult(string result)
{
    string[] tokens = Decode(result);

    if (tokens.Length < 4)
    {
        return null;
    }

    return new LanguageResult
    {
        translatedText = tokens[0],
        originalText = tokens[1],
        unknowValue = tokens[2],
        languageIso = tokens[3]
    };
}

// Plain data holder for the four values decoded from a translation response.
public class LanguageResult
{
    // The translated sentence (first decoded value).
    public string translatedText;
    // The sentence that was submitted for translation (second decoded value).
    public string originalText;
    // Third decoded value; its meaning is not documented here
    // (it is "0" in the sample response).
    public string unknowValue;
    // Language code reported by the service (fourth decoded value, "en" in the sample).
    public string languageIso;
}

// Tokenizes a JSON-like nested array (e.g. [[["a, b","c",,,0]],,"en"]) into a
// flat, ordered list of its values.
//
// - Quoted strings are returned without their quotes; separators inside them
//   are kept, and JSON escapes (\" \\ \/ \n \r \t \b \f \uXXXX) are decoded.
// - Bare values (numbers, null, ...) are returned verbatim.
// - Brackets, commas and empty slots produce no entries.
//
// Returns an empty array for null or empty input. An unterminated string at
// the end of the input is dropped; an unterminated bare value is returned.
private string[] Decode(string input)
{
    List<string> finalResult = new List<string>();
    if (string.IsNullOrEmpty(input))
    {
        return finalResult.ToArray();
    }

    const string separators = ",[]\"";
    // StringBuilder avoids re-allocating the whole token for every character.
    System.Text.StringBuilder current = new System.Text.StringBuilder();
    bool inToken = false;
    bool inString = false;

    for (int i = 0; i < input.Length; i++)
    {
        char chr = input[i];

        if (inString)
        {
            if (chr == '"')
            {
                // Closing quote: the string value is complete.
                finalResult.Add(current.ToString());
                current.Length = 0;
                inString = false;
            }
            else if (chr == '\\' && i + 1 < input.Length)
            {
                // Escape sequence: consume the next character and translate it.
                char esc = input[++i];
                switch (esc)
                {
                    case 'n': current.Append('\n'); break;
                    case 'r': current.Append('\r'); break;
                    case 't': current.Append('\t'); break;
                    case 'b': current.Append('\b'); break;
                    case 'f': current.Append('\f'); break;
                    case 'u':
                        int codeUnit;
                        if (i + 4 < input.Length && int.TryParse(
                                input.Substring(i + 1, 4),
                                System.Globalization.NumberStyles.AllowHexSpecifier,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out codeUnit))
                        {
                            current.Append((char)codeUnit);
                            i += 4;
                        }
                        else
                        {
                            // Malformed \u escape: keep the character literally.
                            current.Append(esc);
                        }
                        break;
                    default:
                        // Covers \" \\ \/ and any unknown escape: keep the character itself.
                        current.Append(esc);
                        break;
                }
            }
            else
            {
                current.Append(chr);
            }
            continue;
        }

        if (chr == '"')
        {
            // A quote directly after a bare value ends that value first.
            if (inToken)
            {
                finalResult.Add(current.ToString());
                inToken = false;
            }
            current.Length = 0;
            inString = true;
            continue;
        }

        bool isSeparator = separators.IndexOf(chr) >= 0;
        if (inToken)
        {
            if (isSeparator)
            {
                finalResult.Add(current.ToString());
                current.Length = 0;
                inToken = false;
            }
            else
            {
                current.Append(chr);
            }
        }
        else if (!isSeparator)
        {
            // First character of a bare value.
            inToken = true;
            current.Length = 0;
            current.Append(chr);
        }
        // Separators outside of any value carry no data and are skipped.
    }

    // Flush a bare value that runs to the end of the input.
    if (inToken)
    {
        finalResult.Add(current.ToString());
    }

    return finalResult.ToArray();
}
Community
  • 1
  • 1
Programmer
  • 121,791
  • 22
  • 236
  • 328
  • Can the strings also have (escaped) quotes in them? – Blorgbeard Aug 24 '16 at 21:30
  • `Regex.Split` might be the way to go here, then you could specifically disregard `,` followed by a space, etc. – l'L'l Aug 24 '16 at 21:30
  • @Blorgbeard I just checked, yes it can have \" in it. This is so tricky to me. – Programmer Aug 24 '16 at 21:33
  • @l'L'l It would be good if you provide an example. – Programmer Aug 24 '16 at 21:34
  • It's *almost* JSON, you may be able to find a JSON parser that deals with empty array elements. – Blorgbeard Aug 24 '16 at 21:35
  • Yes, it's almost like json except that it has no attribute. I am doing this in Unity and therefore trying my best to avoid external libraries unless there is no other way of doing this. – Programmer Aug 24 '16 at 21:38
  • The following regex can parse this string and handle \" : `"(.+?[^\\])"|(\d+)` – just.ru Aug 24 '16 at 21:40
  • Looks like JSON.Net will actually handle it: https://dotnetfiddle.net/vlswks – Blorgbeard Aug 24 '16 at 21:43
  • @Blorgbeard JSON.Net from NuGet is not compatible with Unity. The ported version is not [free](https://www.assetstore.unity3d.com/en/#!/content/11347) I will likely go with this paid version if there is no other way. Thanks the code sample. – Programmer Aug 24 '16 at 21:50
  • You might also try: http://stackoverflow.com/questions/9573119/how-to-parse-json-without-json-net-library – Blorgbeard Aug 24 '16 at 21:52
  • @Blorgbeard I wish I could but `System.Json` supports .NET >=4.5. Unity is uses <=.NET 3.5. Unity has built in Json serializer called [JsonUtility](https://docs.unity3d.com/ScriptReference/JsonUtility.html) they must have attributes for it to work. @just.ru that didn't work either. – Programmer Aug 24 '16 at 21:57

4 Answers4

4

You could code up a simple parser yourself. Here's one I threw together (could use some cleaning up, but demonstrates the idea):

// Lazily tokenizes a JSON-like nested array (e.g. [[["a, b","c",,,0]],,"en"])
// into its values, in order.
//
// - Quoted strings are yielded without their quotes; separators inside them
//   are kept, and JSON escapes (\" \\ \/ \n \r \t \b \f \uXXXX) are decoded.
// - Bare values (numbers, null, ...) are yielded verbatim.
// - Brackets, commas and empty slots yield nothing.
//
// Yields nothing for null or empty input. An unterminated string at the end of
// the input is dropped; an unterminated bare value is yielded.
private static IEnumerable<string> Parse(string input) {
    if (string.IsNullOrEmpty(input)) {
        yield break;
    }

    const string separators = ",[]\"";
    // StringBuilder avoids re-allocating the whole token for every character.
    var current = new System.Text.StringBuilder();
    bool inToken = false;
    bool inString = false;

    for (int i = 0; i < input.Length; i++) {
        char chr = input[i];

        if (inString) {
            if (chr == '"') {
                // Closing quote: the string value is complete.
                yield return current.ToString();
                current.Length = 0;
                inString = false;
            } else if (chr == '\\' && i + 1 < input.Length) {
                // Escape sequence: consume the next character and translate it.
                char esc = input[++i];
                switch (esc) {
                    case 'n': current.Append('\n'); break;
                    case 'r': current.Append('\r'); break;
                    case 't': current.Append('\t'); break;
                    case 'b': current.Append('\b'); break;
                    case 'f': current.Append('\f'); break;
                    case 'u':
                        int codeUnit;
                        if (i + 4 < input.Length && int.TryParse(
                                input.Substring(i + 1, 4),
                                System.Globalization.NumberStyles.AllowHexSpecifier,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out codeUnit)) {
                            current.Append((char)codeUnit);
                            i += 4;
                        } else {
                            // Malformed \u escape: keep the character literally.
                            current.Append(esc);
                        }
                        break;
                    default:
                        // Covers \" \\ \/ and any unknown escape: keep the character itself.
                        current.Append(esc);
                        break;
                }
            } else {
                current.Append(chr);
            }
            continue;
        }

        if (chr == '"') {
            // A quote directly after a bare value ends that value first.
            if (inToken) {
                yield return current.ToString();
                inToken = false;
            }
            current.Length = 0;
            inString = true;
            continue;
        }

        bool isSeparator = separators.IndexOf(chr) >= 0;
        if (inToken) {
            if (isSeparator) {
                yield return current.ToString();
                current.Length = 0;
                inToken = false;
            } else {
                current.Append(chr);
            }
        } else if (!isSeparator) {
            // First character of a bare value.
            inToken = true;
            current.Length = 0;
            current.Append(chr);
        }
        // Separators outside of any value carry no data and are skipped.
    }

    // Flush a bare value that runs to the end of the input.
    if (inToken) {
        yield return current.ToString();
    }
}

Here's a jsfiddle demo.

Blorgbeard
  • 101,031
  • 48
  • 228
  • 272
  • Nice out of the box thinking here +1 – l'L'l Aug 24 '16 at 22:34
  • Is it possible to make this into a function that returns string array of 4 that contains those result? So instead of `private static IEnumerable Parse(string input)`, it becomes `private string [] Parse(string input)`? – Programmer Aug 24 '16 at 22:39
  • Sure, just wrap it and call `.ToArray()` - or you could add to a `List` instead of `yield return`ing values, and then return the list `.ToArray()`ed at the end. – Blorgbeard Aug 24 '16 at 22:41
  • I did something like [this](http://pastebin.com/D7TUUkS9) but that didn't work. Can you spot the problem? – Programmer Aug 24 '16 at 22:54
  • Yes, `finalResult.Add(current);` should be in the places where I had `yield return current`, not just at the bottom. – Blorgbeard Aug 24 '16 at 23:01
  • Thanks for your help. I spent time to test your solution and it works %100. Character escape works too. Thank you again! – Programmer Aug 25 '16 at 01:05
1

Using Regex.Split you could do something like this for example:

using System;
using System.Text.RegularExpressions;

// Demonstrates splitting the sample translation response with Regex.Split and
// printing only the pieces that begin with a letter or a digit.
public class Example
{
    public static void Main()
    {
        string input = "[[[\"Bonjour, Ceci est un test!\",\"Hello, This is a test!\",,,0]],,\"en\"]";
        string[] pieces = Regex.Split(input, "\\[|\\]|[^a-zA-Z ],|\",\"|\"|\"");

        for (int index = 0; index < pieces.Length; index++)
        {
            string piece = pieces[index];

            // Skip the empty fragments produced between adjacent separators.
            if (String.IsNullOrEmpty(piece))
            {
                continue;
            }

            // Keep only fragments whose first character is a letter or a digit.
            char first = piece[0];
            if (!Char.IsLetter(first) && !Char.IsDigit(first))
            {
                continue;
            }

            Console.WriteLine(piece);
        }
    }
}

Output:

Bonjour, Ceci est un test!
Hello, This is a test!
0
en

If you want everything that was split you can simply remove the bool check for alphacharacters.

l'L'l
  • 44,951
  • 10
  • 95
  • 146
  • Wow this is so close. The only problem is that it is skipping the `0`. Please take a look at the question again. You will see which `0`. – Programmer Aug 24 '16 at 22:19
  • Still the-same thing I got in your original answer. – Programmer Aug 24 '16 at 22:24
  • @Programmer: See edit ( online example : https://dotnetfiddle.net/00jsNa ) – l'L'l Aug 24 '16 at 22:30
  • I don't why but I am getting the-same result without the `0`. – Programmer Aug 24 '16 at 22:36
  • Yes, that's why I said I don't know why. It works online but not on my side. Probably because of diffferent compiler or .NET version – Programmer Aug 24 '16 at 22:37
  • 3.5, Unity3D. Don't even know the compiler name but I am 100% sure it is different from the one online. Just tired the modification....Still the-same result. – Programmer Aug 24 '16 at 22:40
  • The-same thing. The 0 is the only thing left. FYI I ran out of upvotes but will be able to at 8PM when time resets – Programmer Aug 24 '16 at 22:56
  • @Programmer: Hey no worries... sometimes the order in which the regex pattern is placed makes a difference also. One more way you might try is `var parse = Regex.Split(input, "[^0-9a-zA-Z\" ],[^0-9a-zA-Z]*|[\\]\\[\"]");` - https://dotnetfiddle.net/WrGSwh ... The comma is the main problem in this situation, since it needs to be kept in the string, but also used as a delimiter. – l'L'l Aug 24 '16 at 23:44
  • Funny thing is that when I re-started Unity and the code in your answer works. The bad side is that it fails when I use `\"` in the string. Try it your self. Blorgbeard's answer passed that test. Thank you so much for your help. – Programmer Aug 25 '16 at 01:01
  • 1
    You're welcome! Regex sometimes takes a lot of tinkering with more complex scenarios like you have, but at any rate glad you found a solution. – l'L'l Aug 25 '16 at 01:17
1

Here is a crazy idea - split by " and then by the rest (but won't work if there is " between the "'s)

// Sample response to split.
var s = @"[[[""Bonjour, Ceci est un test!"",""Hello, This is a test!"",,,0]],,""en""]";

// Splitting on '"' puts quoted text at the odd positions; those segments are kept
// whole, while the even (structural) segments are split on brackets and commas.
char[] structural = "[],".ToArray();
var a = s.Split('"')
         .SelectMany((segment, position) => position % 2 == 1
             ? new[] { segment }
             : segment.Split(structural, StringSplitOptions.RemoveEmptyEntries))
         .ToArray();

Debug.Print(string.Join("|", a)); // "Bonjour, Ceci est un test!|Hello, This is a test!|0|en"
Slai
  • 22,144
  • 5
  • 45
  • 53
0

You can try regex for splitting. I tested with the sample you provided. It results like this.

    var str="[[[\"Bonjour, Ceci est un test!\",\"Hello, This is a test!\",,,0]],,\"en\"]";
    // Match whole values instead of splitting on separators: either a quoted
    // string (escaped characters allowed) or a run of characters that are not
    // brackets, commas or quotes. Splitting on ',' would cut quoted text that
    // contains commas and would also produce empty entries.
    var matches=Regex.Matches(str,@"""(?:\\.|[^""\\])*""|[^\[\],""]+");
    var splitted=new string[matches.Count];
    for(var i=0;i<matches.Count;i++){
       splitted[i]=matches[i].Value;
    }
    // Prints each value on its own line; quoted values keep their quotes.
    foreach(var split in splitted){
       Console.WriteLine(split );
    }

   "Bonjour Ceci est un test!"
   "Hello This is a test!"
    0
   "en"
omer faruk
  • 350
  • 4
  • 13