2

In my program I have a string variable named content, to which I have assigned a small HTML document. For example:

// Double quotes inside a regular C# string literal must be escaped with a backslash.
String content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";

From this I want to extract only the text "This is a Medium Header Send me mail at support@yourcompany.com.This is a new sentence without a paragraph break."

This text is located inside the H2 tag. How can I get this string using C#?

Ajar
  • 162
  • 1
  • 10

2 Answers2

7

Don't use string methods or regex to parse HTML. You can use HtmlAgilityPack.

string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";

// Parse the markup into a DOM instead of searching the raw string.
var doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(content);

// FirstOrDefault (rather than First) avoids an InvalidOperationException
// when the document contains no H2 element; in that case the result is an empty string.
var headerNode = doc.DocumentNode.Descendants("H2").FirstOrDefault();
string headerText = headerNode != null ? headerNode.InnerText : string.Empty;

Result:

This is a Medium Header Send me mail atsupport@yourcompany.com.This is a new sentence without a paragraph break.
carla
  • 1,970
  • 1
  • 31
  • 44
Tim Schmelter
  • 450,073
  • 74
  • 686
  • 939
-1

Complete sample

HtmlFormatHelper.cs:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Tools
{
    /// <summary>
    /// A set of utilities for converting HTML markup to plain text and for
    /// lightly escaping text before it is placed into HTML.
    /// </summary>
    /// <remarks>
    /// The conversion is regex-based: it recognises simple tags only and is not a
    /// full HTML parser. Every method returns <c>null</c> when given <c>null</c>.
    /// </remarks>
    public static class HtmlFormatHelper
    {
        // <br>, <br/>, <p>, <p/> (plus trailing whitespace) and </p> become line breaks.
        private static readonly Regex _regexLineBreak =
            new Regex(@"<(br|BR|p|P)\s{0,1}\/{0,1}>\s*|</[pP]>", RegexOptions.Singleline);

        // Any remaining tag, including an unterminated one at the end of the input.
        private static readonly Regex _regexStripFormatting =
            new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);

        // Whitespace-only gaps between two tags. \s is used instead of \W so that
        // punctuation between tags (e.g. "</b>, <i>") is kept in the output.
        private static readonly Regex _regexTagWhiteSpace =
            new Regex(@">\s+<", RegexOptions.Singleline);

        // <a ... href="url" ...>text</a>; group 1 is the URL, group 2 the link text.
        private static readonly Regex _regexHyperlink =
            new Regex(@"<a\s+[^>]*href\s*=\s*[""']?([^""'>]+)[""']?[^>]*>([^<]+)</a>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        /// <summary>
        /// Converts HTML to plain text: collapses whitespace between tags, turns
        /// line-break and paragraph tags into <see cref="Environment.NewLine"/>,
        /// removes all other tags and finally decodes HTML entities.
        /// </summary>
        /// <param name="html">HTML markup; may be <c>null</c>.</param>
        /// <returns>The plain text, or <c>null</c> if <paramref name="html"/> is <c>null</c>.</returns>
        public static string HtmlToPlainText(string html)
        {
            if (html == null)
            {
                return null;
            }

            var text = _regexTagWhiteSpace.Replace(html, "><");
            text = _regexLineBreak.Replace(text, Environment.NewLine);
            text = _regexStripFormatting.Replace(text, string.Empty);

            // Entities are decoded last: decoding first would turn escaped text such as
            // "&lt;b&gt;" into real markup, which the tag stripping above would then delete.
            return System.Net.WebUtility.HtmlDecode(text);
        }

        /// <summary>
        /// Converts HTML to plain text with "smart" formatting: a hyperlink whose
        /// target differs from its text is rendered as <c>text (url)</c>.
        /// </summary>
        /// <param name="html">HTML markup; may be <c>null</c>.</param>
        /// <returns>The plain text, or <c>null</c> if <paramref name="html"/> is <c>null</c>.</returns>
        public static string HtmlToPlainTextSmart(string html)
        {
            if (html == null)
            {
                return null;
            }

            // Rewrite hyperlinks before the generic tag stripping removes them.
            var withLinks = _regexHyperlink.Replace(html, match =>
            {
                string url = match.Groups[1].Value.Trim();
                string text = match.Groups[2].Value.Trim();

                // URLs are identifiers, so compare them ordinally rather than culture-sensitively.
                bool sameAsText = string.Equals(url, text, StringComparison.OrdinalIgnoreCase);

                if (url.Length == 0 || sameAsText)
                {
                    // No URL, or the URL is already the visible text: keep the anchor
                    // unchanged and let HtmlToPlainText strip the tag.
                    return match.Value;
                }

                // URL and text differ: show both.
                return string.Format("{0} ({1})", text, url);
            });

            return HtmlToPlainText(withLinks);
        }

        /// <summary>
        /// Encodes text for HTML in "soft" mode: only <c>&lt;</c> and <c>&gt;</c> are
        /// replaced by entities; every other character, including <c>&amp;</c> and
        /// quotes, is copied through unchanged.
        /// </summary>
        /// <param name="html">Text to encode; may be <c>null</c>.</param>
        /// <returns>The encoded text, or <c>null</c> if <paramref name="html"/> is <c>null</c>.</returns>
        public static string SoftHtmlEncode(string html)
        {
            if (html == null)
            {
                return null;
            }

            // The replacement strings contain neither '<' nor '>', so the two passes
            // cannot interfere with each other.
            return html.Replace("<", "&lt;").Replace(">", "&gt;");
        }
    }
}

How to use:

// input string
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";

// extract the inner HTML of the body element:
//  - Singleline lets '.' match line breaks, so multi-line documents are handled;
//  - [^>]* accepts attributes on the opening tag (e.g. <body class="x">).
// If there is no body element, Groups[1].Value is an empty string.
string htmlBody = Regex.Match(content, @"<body\b[^>]*>(.*)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups[1].Value;

// plain text
string plainText = Tools.HtmlFormatHelper.HtmlToPlainText(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com.This is a new sentence without a paragraph break.

// plain text (with url in brackets)
string plainTextSmart = Tools.HtmlFormatHelper.HtmlToPlainTextSmart(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com (mailto:support@yourcompany.com).This is a new sentence without a paragraph break.
Community
  • 1
  • 1
General-Doomer
  • 2,681
  • 13
  • 13
  • Never use regex for HTML matching or parsing http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags – Nico May 08 '15 at 10:23
  • In some cases it is more efficient than using external libraries. – General-Doomer May 08 '15 at 10:25
  • Nope never. Read the post. – Nico May 08 '15 at 10:29
  • It's not only less efficient, the point is that it simply doesn't work with regex in many cases. And nobody wants to maintain that regex library if you've found the next edge case. Imo that's a ticking time bomb. – Tim Schmelter May 08 '15 at 11:02