-5

So, the html data I'm looking at is:

<A HREF="/data/client/Action.log">Action.log</A><br>  6/8/2015  3:45 PM 

From this I need to extract either of the instances of Action.log.

My problem is I've been over a ton of regex tutorials and I still can't seem to brain up a pattern to extract it. I guess I'm lacking some fundamental understanding of regex, but any help would be appreciated.

Edit:

/// <summary>
/// Downloads the directory-index page at <paramref name="url"/> and scans it line by line
/// for lowercase <c>&lt;a href=</c> anchors, stopping at the first anchor-free line that
/// contains <c>&lt;/pre</c>.
/// </summary>
/// <param name="url">Address of the index page; also used as the prefix for sub-directory URLs.</param>
/// <param name="directory">
/// Receives a new <c>WebDirectory</c> for every href that contains a slash, except <c>../</c>.
/// </param>
/// <returns>
/// The hrefs that contain no slash (treated as files); an empty array when the response
/// status is not OK; <c>null</c> when an exception is thrown (the exception and the URL are logged).
/// </returns>
internal string[] ParseFolderIndex_Alpha(string url, WebDirectory directory)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 3 * 60 * 1000; // three minutes, expressed in milliseconds
            request.KeepAlive = true;

            // The using block releases the response on every exit path, including the
            // non-OK return and exceptions thrown while reading the body.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return new string[0];
                }

                List<string> fileLocations = new List<string>();
                using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        int index = line.IndexOf("<a href=", StringComparison.Ordinal);
                        if (index < 0)
                        {
                            // A line without an anchor only matters when it closes the listing.
                            if (line.Contains("</pre"))
                            {
                                break;
                            }

                            continue;
                        }

                        string[] segments = line.Substring(index).Split('\"');
                        if (segments.Length < 2)
                        {
                            // Anchor without a quoted href: skip this line instead of letting an
                            // IndexOutOfRangeException abort the whole listing.
                            continue;
                        }

                        // TODO: the file size could be parsed from this line as well.
                        string href = segments[1];
                        if (!href.Contains("/"))
                        {
                            fileLocations.Add(href);
                            UI.UpdatePatchNotes("Web File Found: " + href);

                            UI.UpdateProgressBar();
                        }
                        else if (href != @"../")
                        {
                            directory.SubDirectories.Add(new WebDirectory(url + href, this));
                            UI.UpdatePatchNotes("Web Directory Found: " + href.Replace("/", string.Empty));
                        }
                    }
                }

                return fileLocations.ToArray();
            }
        }

        catch (Exception e)
        {
            LogHandler.LogErrors(e.ToString(), this);
            LogHandler.LogErrors(url, this);
            return null;
        }
    }

That's what I was doing, the problem is I changed servers and the html IIS is displaying is different so I have to make new logic.

Edit / Conclusion:

First of all, I'm sorry I even mentioned regex :P Secondly, each platform will have to be handled individually depending on its environment.

This is how I'm currently gathering the file names.

/// <summary>
/// Downloads the directory-index page at <paramref name="url"/> and extracts file names and
/// sub-directories from every line that contains an uppercase <c>&lt;/A&gt;</c> closing tag.
/// </summary>
/// <param name="url">Address of the index page; sub-directory URLs are built as <c>url + "/" + name</c>.</param>
/// <param name="directory">
/// Receives a new <c>WebDirectory</c> for every quoted path that contains neither a dot nor "Verinfo".
/// </param>
/// <returns>
/// The file names found (last path component of each quoted path containing a dot or "Verinfo",
/// excluding empty names and names containing '%'); an empty array when the response status is
/// not OK; <c>null</c> when an exception is thrown (the exception and the URL are logged).
/// </returns>
internal string[] ParseFolderIndex(string url, WebDirectory directory)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = 3 * 60 * 1000; // three minutes, expressed in milliseconds
                request.KeepAlive = true;

                // The using block releases the response on every exit path, including the
                // non-OK return and exceptions thrown while reading the body.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        return new string[0];
                    }

                    List<string> fileLocations = new List<string>();
                    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                    {
                        bool endMet = false;
                        string line;

                        // ReadLine returns null at end of stream; stopping there prevents the
                        // endless loop that occurred when no terminating line was ever seen.
                        while (!endMet && (line = reader.ReadLine()) != null)
                        {
                            if (line.IndexOf("</A>", StringComparison.Ordinal) < 0)
                            {
                                continue;
                            }

                            if (line.Contains("</html>"))
                            {
                                endMet = true;
                            }

                            // Quoted segments that contain no '<' are taken to be href values.
                            string[] segments = line.Replace("\\", "").Split('\"');
                            List<string> paths = new List<string>();
                            List<string> files = new List<string>();
                            for (int i = 0; i < segments.Length; i++)
                            {
                                if (!segments[i].Contains('<'))
                                {
                                    paths.Add(segments[i]);
                                }
                            }

                            // The first path on a line is discarded (presumably the parent-directory
                            // link - confirm against the server's listing format). Guarded so a line
                            // without any quoted path no longer throws.
                            if (paths.Count > 0)
                            {
                                paths.RemoveAt(0);
                            }

                            foreach (String s in paths)
                            {
                                string[] secondarySegments = s.Split('/');
                                if (s.Contains(".") || s.Contains("Verinfo"))
                                {
                                    files.Add(secondarySegments[secondarySegments.Length - 1]);
                                }
                                else if (secondarySegments.Length >= 2)
                                {
                                    // For a path ending in '/', the second-to-last component is the folder name.
                                    string folderName = secondarySegments[secondarySegments.Length - 2];
                                    directory.SubDirectories.Add(new WebDirectory
                                        (url + "/" + folderName, this));
                                    UI.UpdatePatchNotes("Web Directory Found: " + folderName);
                                }
                                // A path with no slash and no dot has no folder component; it is skipped
                                // instead of indexing out of range.
                            }

                            foreach (String s in files)
                            {
                                if (!String.IsNullOrEmpty(s) && !s.Contains('%'))
                                {
                                    fileLocations.Add(s);
                                    UI.UpdatePatchNotes("Web File Found: " + s);

                                    UI.UpdateProgressBar();
                                }
                            }

                            if (line.Contains("</pre"))
                            {
                                break;
                            }
                        }
                    }

                    return fileLocations.ToArray();
                }
            }

            catch (Exception e)
            {
                LogHandler.LogErrors(e.ToString(), this);
                LogHandler.LogErrors(url, this);
                return null;
            }
        }
Eidenai
  • 401
  • 6
  • 18
  • Do you want the value of the anchor tag? – grmbl Mar 04 '16 at 11:29
  • 3
    Show what you have tried, and show some example input and output. – CodeCaster Mar 04 '16 at 11:29
  • No, literally the file's name. I at first was using split, but I figured regex would be a smarter solution. – Eidenai Mar 04 '16 at 11:29
  • You know what they say about regex? Before you had 1 problem, apply regex and you have 2 :) But I agree regex is cool! – grmbl Mar 04 '16 at 11:31
  • 7
    Did you consider trying an HTML parser, e.g. HtmlAgilityPack, to retrieve data from HTML? Using regex for this purpose leads to cumbersome code (look, what you have is already cumbersome), and issues like the one you are having. – Wiktor Stribiżew Mar 04 '16 at 11:32
  • @grmbl - I've never heard that, good to know. :) – Eidenai Mar 04 '16 at 11:33
  • @CodeCaster I added some code to my post, but like I said in the edit, I changed my server hosting and the HTML that IIS is outputting is different from that of the web host I was using. – Eidenai Mar 04 '16 at 11:33
  • 3
    Listen to Wiktor Stribiżew. I will direct your attention to the famous SO post about parsing HTML with RegEx http://stackoverflow.com/a/1732454/201648 – Aaron Newton Mar 04 '16 at 11:33
  • @WiktorStribiżew I have considered it, but I'd prefer to do it myself. It's like exercise I guess. – Eidenai Mar 04 '16 at 11:34
  • Ok, I see. You will understand me sooner or later. It took me some time, too. You can only use regex when there is no way to do it without, as in some tools that only use S&R rules based on regex patterns. You have the full power of C#, and it is not a good idea to refuse from that. – Wiktor Stribiżew Mar 04 '16 at 11:52
  • Like I said, I ended up not even using regex. – Eidenai Mar 04 '16 at 13:14
  • @WiktorStribiżew when you say html parser, do you mean a third party application? I'll look into HtmlAgilityPack. – Eidenai Mar 09 '16 at 06:45
  • 1
    Yes, this library is good to use if you need to extract or modify HTML in .NET. I tried some others, too, but this one seems powerful enough to handle both extraction and HTML data manipulation. – Wiktor Stribiżew Mar 09 '16 at 07:07
  • May I ask the benefits of using their library? – Eidenai Mar 09 '16 at 07:34

3 Answers3

1

Try matching the following pattern:

<A HREF="(?<url>.*)">

Then get the group called url from the match results.

Working example: https://regex101.com/r/hW8iH6/1

Jakub Konecki
  • 45,581
  • 7
  • 87
  • 126
1

Regex for this is overkill. It's too heavy, and considering the format of the string will always be the same, you're going to find it easier to debug and maintain using splitting and substrings.

 // Demonstrates extracting a file name from an anchor tag by splitting on
 // double quotes and slashes rather than using a regular expression.
 class Program {
    static void Main(string[] args) {

        string anchorLine = "<A HREF=\"/data/client/Action.log\">Action.log</A><br>  6/8/2015  3:45 PM ";

        // Splitting on the double quote puts the href value in the second token.
        string[] quoteTokens = anchorLine.Split('"');
        string href = quoteTokens[1];

        // Option 1: the entire file name and path.
        string fileName = href;

        // Option 2: just the file name (Action.log in this case).
        string[] pathParts = href.Split('/');
        fileName = pathParts.Last();
    }
}
Alex
  • 1,643
  • 1
  • 14
  • 32
  • 2
    This is what I ended up doing. :) Thanks to everyone else who answered, I appreciate you looking past the majority consensus. I don't know why they down voted these answers but I up-voted all three. – Eidenai Mar 04 '16 at 11:42
  • 1
    I'm curious as to why this was downvoted. The URL format shows a log file, which is likely to be a specific logging format. These don't change normally, but even if it did, splitting on the double quote will always leave the file name as the second token, so whats the point in regex? Regex is excellent for so many applications, but if your source data is in a known format, you're evaluating things you already know and can predict! – Alex Mar 04 '16 at 12:07
-1
// Sample directory-listing line to run the pattern against.
string text = @"<A HREF=""/data/client/Action.log"">Action.log</A><br>  6/8/2015  3:45 PM";

// The first capture group holds the link text between the opening and closing anchor tags.
Regex anchorPattern = new Regex(@"^<A HREF=\""\/data\/client\/.*\.log\"">(.*)</A>.*$");
Match match = anchorPattern.Match(text);
Group linkText = match.Groups[1];
string result = linkText.Value;

Try http://regexr.com/ or Regexbuddy!

grmbl
  • 2,514
  • 4
  • 29
  • 54