(This question about refactoring F# code got me one down vote, but also some interesting and useful answers. And 62 F# questions out of the 32,000+ on SO seems pitiful, so I'm going to take the risk of more disapproval!)
I was trying to post a bit of code on a blogger blog yesterday, and turned to this site, which I had found useful in the past. However, the blogger editor ate all the style declarations, so that turned out to be a dead end.
So (like any hacker), I thought "how hard can it be?" and rolled my own in <100 lines of F#.
Here is the 'meat' of the code, which turns an input string into a list of 'tokens'. Note that these tokens aren't to be confused with the lexing/parsing-style tokens. I did look at those briefly, and though I hardly understood anything, I did understand that they would give me only tokens, whereas I want to keep my original string.
The question is: is there a more elegant way of doing this? I don't like having to re-define s once per token (n times in all) just to strip each token's text from the front of the input string, but it's difficult to split the string into potential tokens in advance, because of things like comments, strings and the #region directive (which contains a non-word character).
//Types of tokens we are going to detect
/// One classified slice of the input text. Every case except EOF carries the
/// exact substring that was matched.
type Token =
/// A run of whitespace characters.
| Whitespace of string
/// A `//` line comment or a `/* ... */` block comment.
| Comment of string
/// A double-quoted string literal, including its quotes.
| Strng of string
/// A `#region` / `#endregion` directive, or text accepted by the keyword check.
| Keyword of string
/// Any other non-whitespace text.
| Text of string
/// Produced when there is no input left to classify.
| EOF
//turn a string into a list of recognised tokens
/// Splits `s` into a list of tokens in source order. Each token carries the
/// exact text it was matched from, and every character of the input ends up in
/// exactly one token, so concatenating the token strings reproduces `s`.
/// EOF only terminates the scan; it is never part of the returned list.
let tokenize (s:String) =
    /// Some(matched text) when `pattern` matches a non-empty prefix of `input`.
    /// Each regex is evaluated once, and empty matches are rejected so that
    /// every token consumes at least one character.
    let (|Prefix|_|) (pattern:string) (input:String) =
        let m = Regex.Match(input, pattern)
        if m.Success && m.Length > 0 then Some m.Value else None

    //this is the 'parser': classify whatever the remaining input starts with
    let nexttoken (st:String) =
        match st with
        | "" -> EOF
        | Prefix @"^\s+" ws -> Whitespace ws
        // double slash-style comment; the newline is optional so a comment on
        // the last line of the input is still recognised
        | Prefix @"^//[^\n]*\n?" c -> Comment c
        // /* */ style comment; [\s\S] matches any character, newlines included
        | Prefix @"^/\*[\s\S]*?\*/" c -> Comment c
        // unescaped regexp = ^"([^"\\]|\\.|"")*"
        | Prefix @"^""([^""\\]|\\.|"""")*""" str -> Strng str
        | Prefix @"^#(end)?region" directive -> Keyword directive
        // all text until next whitespace or quote
        | Prefix @"^[^""\s]+" word ->
            if iskeyword word then Keyword word else Text word
        // Only reachable for a quote that does not start a complete string
        // literal. Emit it on its own so the scan always makes progress
        // instead of looping forever on an empty match.
        | _ -> Text(st.Substring(0, 1))

    //tail-recursive use of nexttoken to transform the string into a token list
    let rec loop (rest:String) acc =
        let t = nexttoken rest
        match t with
        | EOF -> List.rev acc //accumulator is built backwards, so reverse it
        | Whitespace x | Comment x
        | Keyword x | Text x | Strng x ->
            loop (rest.Substring(x.Length)) (t :: acc) //tail recursive
    loop s []
(If anyone is really interested, I am happy to post the rest of the code)
EDIT: Using kvb's excellent suggestion of active patterns, the central bit now looks like this, which is much better!
/// Classifies the token at the start of `st` using the `Matches` active
/// pattern. Returns EOF only when `st` is empty; any other input yields a
/// token carrying a non-empty prefix of `st`.
let nexttoken (st:String) =
    match st with
    | Matches @"^\s+" s -> Whitespace(s)
    // double slash-style comment; the newline is optional so a comment on the
    // last line of the input is still recognised
    | Matches @"^//[^\n]*\n?" s -> Comment(s)
    // /* */ style comment; [\s\S] matches any character, newlines included
    | Matches @"^/\*[\s\S]*?\*/" s -> Comment(s)
    // unescaped regexp = ^@?"([^"\\]|\\.|"")*"
    | Matches @"^@?""([^""\\]|\\.|"""")*""" s -> Strng(s)
    | Matches @"^#(end)?region" s -> Keyword(s)
    // all text until next whitespace or quote
    | Matches @"^[^""\s]+" s ->
        (match s with
         | IsKeyword _ -> Keyword(s)
         | _ -> Text(s))
    | "" -> EOF
    // Only reachable for a quote that does not start a complete string literal.
    // Emit that one character as Text so the rest of the input is still
    // tokenized rather than being cut off by a premature EOF.
    | _ -> Text(st.Substring(0, 1))