0

I have the following xml:

<?xml version="1.0" encoding="utf-8"?>
<parent>
   <element href="www.something.com" title="First">
        <element href="www.something.com" title="Second">
           <element href="www.something.com" title="Third">
           </element>    
        </element>
    </element>    
    <element href="www.something.com" title="Some title"></element>
    <element href="www.something.com" title="Another">
        <element href="www.something.com" title="Extra">
            <element href="www.something.com" title="Page">
                <element href="www.something.com" title="Target">    
                </element>
            </element>    
        </element>
    </element>
</parent>

How can I parse this xml and format the path for each element using the title attribute from parent down to the deepest children?

Each element can have zero, one or more children.

Example:

<element href="www.something.com" title="First"> // path: First
    <element href="www.something.com" title="Second"> // path: First / Second
       <element href="www.something.com" title="Third"> //path: First / Second /Third
       </element>    
    </element>
</element>

<element href="www.something.com" title="Some title"></element> // path: Some title

<element href="www.something.com" title="Another"> // path: Another
    <element href="www.something.com" title="Extra"> // path: Another / Extra
        <element href="www.something.com" title="Page"> // path: Another / Extra / Page
            <element href="www.something.com" title="Target"> //path: Another / Extra / Page / Target
            </element>
        </element>    
    </element>
</element>
John Saunders
  • 160,644
  • 26
  • 247
  • 397
user2818430
  • 5,853
  • 21
  • 82
  • 148
  • 1
    If it doesn't have a root element then it's not really a valid XML document. I'm not sure how you can parse it, since most XML readers will reject a document that is not well-formed. It seems like a good use for XPath, but without a root, it won't work. – Ron Beyer May 14 '15 at 19:43
  • Sorry, my bad. There is a root element. I have updated the question. – user2818430 May 14 '15 at 19:48
  • By path, do you mean a valid `XPath` string that you could pass to `XElement.XPathSelectElement`? – dbc May 14 '15 at 22:13

2 Answers

0

Parse it using LINQ to XML and then build the paths you need. For any given element:

// Walk from the outermost ancestor down to `element`, keeping the value of
// each "title" attribute; elements without that attribute contribute nothing.
var titles = from node in element.AncestorsAndSelf().Reverse()
             let title = node.Attribute("title")
             where title != null
             select title.Value;

// Join the collected titles root-first, separated by " / ".
var path = string.Join(" / ", titles);
Charles Mager
  • 25,735
  • 2
  • 35
  • 45
0

If you are looking to generate XPath strings that uniquely specify an element in an XML document based on a predefined attribute name, you can put together a solution based on these two answers:

Combining them yields the following extension methods:

/// <summary>
/// LINQ to XML helpers that build an XPath expression locating a given element,
/// identifying each step by the value of a chosen attribute when the element has it.
/// </summary>
public static class XExtensions
{
    /// <summary>
    /// Builds an XPath expression that leads from the topmost ancestor of
    /// <paramref name="element"/> down to the element itself.
    /// </summary>
    /// <param name="element">The element to locate.</param>
    /// <param name="attributeName">Name of the attribute used to identify each step.
    /// Elements lacking the attribute are identified by their element name instead.</param>
    /// <returns>The path steps concatenated root-first. The parentless (root) step has no
    /// leading slash and no positional index; every other step has the form
    /// <c>/step[index]</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="element"/> is null.</exception>
    public static string GetAbsoluteXPathByAttribute(this XElement element, string attributeName)
    {
        return GetXPath(element, e => RelativeXPathByAttribute(e, attributeName));
    }

    /// <summary>
    /// Produces the single path step for <paramref name="element"/>, relative to its parent.
    /// </summary>
    static string RelativeXPathByAttribute(XElement element, string attributeName)
    {
        // Each branch picks the node test for this step together with the matching
        // sibling filter, so the positional index counts exactly the siblings that
        // the node test selects.
        string step;
        Func<XElement, bool> matchesStep;

        var attr = element.Attribute(attributeName);
        if (attr != null)
        {
            // Identify the element by attribute value, whatever its element name is.
            step = string.Format("*[@{0}={1}]", attributeName, XPathLiteral(attr.Value));
            matchesStep = e => { var a = e.Attribute(attributeName); return a != null && a.Value == attr.Value; };
        }
        else if (!string.IsNullOrEmpty(element.Name.Namespace.ToString()))
        {
            // Namespaced element: match on local-name() so no prefix mapping is required.
            step = string.Format("*[local-name()={0}]", XPathLiteral(element.Name.LocalName));
            matchesStep = e => e.Name.LocalName == element.Name.LocalName;
        }
        else
        {
            step = element.Name.LocalName;
            matchesStep = e => e.Name == element.Name;
        }

        var index = IndexPosition(element, matchesStep);

        // -1 means the element has no parent: emit the bare step without slash or index.
        return index == -1
            ? step
            : string.Format(NumberFormatInfo.InvariantInfo, "/{0}[{1}]", step, index);
    }

    /// <summary>
    /// Returns the 1-based position of <paramref name="element"/> among the sibling
    /// elements that share its name, or -1 if the element has no parent.
    /// </summary>
    public static int IndexPosition(this XElement element)
    {
        return IndexPosition(element, e => e.Name == element.Name);
    }

    /// <summary>
    /// Returns the 1-based position of <paramref name="element"/> among the child elements
    /// of its parent that satisfy <paramref name="isRelevant"/>.
    /// </summary>
    /// <param name="element">The element whose position is wanted.</param>
    /// <param name="isRelevant">Filter selecting the siblings that are counted.</param>
    /// <returns>The 1-based position, or -1 if the element has no parent.</returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="InvalidOperationException">The element was not found among the
    /// parent's child elements accepted by <paramref name="isRelevant"/>.</exception>
    public static int IndexPosition(XElement element, Func<XElement, bool> isRelevant)
    {
        // Name the offending parameter, as GetXPath does.
        if (element == null)
            throw new ArgumentNullException("element");
        if (isRelevant == null)
            throw new ArgumentNullException("isRelevant");

        if (element.Parent == null)
            return -1;

        int i = 1; // Indexes for nodes start at 1, not 0

        foreach (var sibling in element.Parent.Elements().Where(isRelevant))
        {
            if (sibling == element)
            {
                return i;
            }
            i++;
        }

        throw new InvalidOperationException("element has been removed from its parent.");
    }

    /// <summary>
    /// Concatenates the relative steps of every ancestor (outermost first) followed by
    /// the step of <paramref name="element"/> itself.
    /// </summary>
    static string GetXPath(XElement element, Func<XElement, string> relativeXPath)
    {
        if (element == null)
            throw new ArgumentNullException("element");
        if (relativeXPath == null)
            throw new ArgumentNullException("relativeXPath");

        // AncestorsAndSelf() yields the element first and its outermost ancestor last,
        // so reversing gives the root-first order the path needs.
        var steps = element.AncestorsAndSelf().Reverse().Select(relativeXPath);

        return string.Concat(steps.ToArray());
    }

    /// <summary>
    /// Produce an XPath literal equal to the value if possible; if not, produce
    /// an XPath expression that will match the value.
    /// From https://stackoverflow.com/questions/1341847/special-character-in-xpath-query
    /// 
    /// Note that this function will produce very long XPath expressions if a value
    /// contains a long run of double quotes.
    /// </summary>
    /// <param name="value">The value to match.</param>
    /// <returns>A double-quoted literal if the value contains no double quote; otherwise a
    /// single-quoted literal if it contains no single quote; otherwise an XPath expression,
    /// using concat(), that evaluates to the value.</returns>
    static string XPathLiteral(string value)
    {
        // No double quote in the value: a double-quoted literal is enough.
        if (!value.Contains("\""))
        {
            return "\"" + value + "\"";
        }
        // Double quotes but no single quote: a single-quoted literal is enough.
        if (!value.Contains("'"))
        {
            return "'" + value + "'";
        }

        // The value contains both kinds of quote: concatenate all non-double-quote
        // substrings (double-quoted) with the double quotes themselves
        // (single-quoted), e.g.:
        //
        //    concat("foo", '"', "bar")
        StringBuilder sb = new StringBuilder("concat(");
        string[] substrings = value.Split('"');
        bool needComma = false; // true once any argument has been written
        for (int i = 0; i < substrings.Length; i++)
        {
            if (substrings[i].Length > 0)
            {
                if (needComma)
                {
                    sb.Append(", ");
                }
                sb.Append('"').Append(substrings[i]).Append('"');
                needComma = true;
            }
            // Every gap between two substrings stands for one double quote of the input.
            if (i < substrings.Length - 1)
            {
                if (needComma)
                {
                    sb.Append(", ");
                }
                sb.Append("'\"'");
                needComma = true;
            }
        }
        sb.Append(")");
        return sb.ToString();
    }
}

And then, to test with and without namespaces:

/// <summary>
/// Round-trip check for GetAbsoluteXPathByAttribute: every generated path, evaluated
/// against the document it came from, must select the element it was generated for.
/// </summary>
public class TestClass
{
    /// <summary>Sample document whose elements are in no namespace.</summary>
    static string GetXml()
    {
        return @"<?xml version=""1.0"" encoding=""utf-8""?>
            <parent>
               <element href=""www.something.com"" title=""First"">
                    <element href=""www.something.com"" title=""Second"">
                       <element href=""www.something.com"" title=""Third"">
                       </element>    
                    </element>
                </element>    
                <element href=""www.something.com"" title=""Some title""></element>
                <element href=""www.something.com"" title=""Another"">
                    <element href=""www.something.com"" title=""Extra"">
                        <element href=""www.something.com"" title=""Page"">
                            <element href=""www.something.com"" title=""Target"">    
                            </element>
                        </element>    
                    </element>
                </element>
            </parent>";
    }

    /// <summary>The same sample document with a default namespace declared on the root.</summary>
    static string GetXmlWithNamespace()
    {
        return @"<?xml version=""1.0"" encoding=""utf-8""?>
            <parent                 
                xmlns=""urn:schemas-microsoft-com:office:spreadsheet"" 
                xmlns:o=""urn:schemas-microsoft-com:office:office"">
               <element href=""www.something.com"" title=""First"">
                    <element href=""www.something.com"" title=""Second"">
                       <element href=""www.something.com"" title=""Third"">
                       </element>    
                    </element>
                </element>    
                <element href=""www.something.com"" title=""Some title""></element>
                <element href=""www.something.com"" title=""Another"">
                    <element href=""www.something.com"" title=""Extra"">
                        <element href=""www.something.com"" title=""Page"">
                            <element href=""www.something.com"" title=""Target"">    
                            </element>
                        </element>    
                    </element>
                </element>
            </parent>";
    }

    /// <summary>Runs the round-trip check on both sample documents.</summary>
    public static void Test()
    {
        Test(GetXml());

        Test(GetXmlWithNamespace());
    }

    /// <summary>
    /// Generates a path for the root and each of its descendants, writes the paths to the
    /// debug output as indented JSON, then asserts that each path selects its own element.
    /// </summary>
    /// <param name="xml">The XML text to parse and check.</param>
    public static void Test(string xml)
    {
        var document = XDocument.Parse(xml);

        // Materialise every (element, path) pair before any output or checking happens.
        var entries = (from node in document.Root.DescendantsAndSelf()
                       select new { Element = node, Path = node.GetAbsoluteXPathByAttribute("title") }).ToList();
        Debug.WriteLine(JsonConvert.SerializeObject(entries.Select(entry => entry.Path), Formatting.Indented));

        foreach (var entry in entries)
        {
            var selected = document.XPathSelectElement(entry.Path);
            Debug.Assert(selected == entry.Element); // No asserts
        }
    }
}

Which produces the following paths, each of which correctly evaluates to the element in question:

  "parent",
  "parent/*[@title=\"First\"][1]",
  "parent/*[@title=\"First\"][1]/*[@title=\"Second\"][1]",
  "parent/*[@title=\"First\"][1]/*[@title=\"Second\"][1]/*[@title=\"Third\"][1]",
  "parent/*[@title=\"Some title\"][1]",
  "parent/*[@title=\"Another\"][1]",
  "parent/*[@title=\"Another\"][1]/*[@title=\"Extra\"][1]",
  "parent/*[@title=\"Another\"][1]/*[@title=\"Extra\"][1]/*[@title=\"Page\"][1]",
  "parent/*[@title=\"Another\"][1]/*[@title=\"Extra\"][1]/*[@title=\"Page\"][1]/*[@title=\"Target\"][1]"
Community
  • 1
  • 1
dbc
  • 104,963
  • 20
  • 228
  • 340