0

I use the code above, but the results returned are missing. Although, when I used it on regex101.com, regexr.com and RegexBuddy, it works correctly. it is missing text:

  • "[6]Thái Thượng Lão Quân (太上老君) là tôn hiệu một vị thần tiên tối cao trong Đạo giáo Trung Quốc, một trong Tam Thanh. Cõi của Thái Thượng Lão Quân ngự gọi là Thanh Cảnh."
  • "Chương ba: Mới nhập phàm trần đã gặp xui xẻo, làm trò cười cho thiên hạ! ...[7] Nam Thiên Môn là giao giới giữa trời và người."
public static string ReadHTMLCode(string URL)
    {
        WebClient webClient = new WebClient();
        byte[] reqHTML;
        reqHTML = webClient.DownloadData(URL);
        UTF8Encoding objUTF8 = new UTF8Encoding();
        return objUTF8.GetString(reqHTML);

    }//method read HTMLcode


public static string get_story_ttv(string url)
    {
        string source = Class_test.ReadHTMLCode(url);
        string pattern = @"[\w \”\.ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐÐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹếý\n\,\“\]\[\(\)\!\…\?\:\-\—\–]+(?=<br \/>\n<br \/>)|^[\w \”\.ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐÐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹếý\n\,\“\]\[\(\)\!\…\?\:\-\—\–]+$";
        Match m = Regex.Match(source, pattern);
        string S_rt = "";
        int kt = 0;
        while (m.Success)
        {
            S_rt += m.Groups[0].Value.Trim() + "\r\n";
            m = m.NextMatch();
        }
        return S_rt;
    }//method get text regex


private void button1_Click(object sender, EventArgs e)
    {
        string url = "http://www.tangthuvien.vn/forum/showthread.php?t=94781";
        richTextBox1.Text = Class_test.get_story_ttv(url);
        MessageBox.Show("DONE");
    }//event buttonClick
Sujeet Sinha
  • 2,417
  • 2
  • 18
  • 27
  • I suggest you check [HtmlAgilityPack](https://htmlagilitypack.codeplex.com/). It is very easy to parse HTML with it. – Wiktor Stribiżew Jun 22 '16 at 16:48
  • Is this a real question or are you trolling? http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 – Jodrell Jun 22 '16 at 16:50
  • Thank you! But I want to know why it is not working properly. – Amino Trần Jun 22 '16 at 16:51
  • When testing regular expressions make sure to use a site that supports .Net like http://regexstorm.net/tester – juharr Jun 22 '16 at 16:52
  • You'll have to forgive but, I don't want to download some HTML from a Vietnamese URL and and attempt to test your overly complex regular expression against it. Especially when I already know that using a regular expression to parse arbitrary HTML is a bad idea. Use HtmlAgilityPack, to get the text before you consider using a Regex, as @WiktorStribiżew suggests. – Jodrell Jun 22 '16 at 17:00
  • link regex101: https://regex101.com/r/oE9eO5/1 – Amino Trần Jun 22 '16 at 17:00
  • I will try and report back later. thank you! – Amino Trần Jun 22 '16 at 17:02

0 Answers0