I am trying to match chess notation. I have a C# regular expression like this:
"(?:[PNBRQK]?[a-h]?[1-8]?x?[a-h][1-8](?:\=[PNBRQK])?|O(-?O){1,2})[\+#]?(\s*[\!\?]+)?";
[I would not mind a C# YACC lexer for Short Algebraic Notation (SAN), but I am using regex for now:]
<move> ::= <move number><move descriptor>
<move number> ::= <digit>[<digit>...]{'.' | '...'}
<move descriptor> ::= <from square><to square>[<promoted to>]
<square> ::= <file letter><rank number>
<file letter> ::= 'a'|'b'|'c'|'d'|'e'|'f'|'g'|'h'
<rank number> ::= '1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'
<promoted to> ::= 'q'|'r'|'b'|'n'
<Piece symbol> ::= 'P' | 'N' | 'B' | 'R' | 'Q' | 'K'
<SAN move descriptor piece moves> ::= <Piece symbol>[<from file>|<from rank>|<from
square>]['x']<to square>
<SAN move descriptor pawn captures> ::= <from file>[<from rank>] 'x' <to square>[<promoted to>]
<SAN move descriptor pawn push> ::= <to square>[<promoted to>]
Sometimes the above regex matches too much, for example these first few moves are matched this way (I remove move numbers before matching):
1e4d5
2exd5Nf6
3d4Qxd5
4c4Qd6
5Nf3c5
6Be3cxd4
7Nxd4a6
8Be2e5
9Nf3Nc6
10O-OQxd1
11Rxd1Be7
12Nc3Be6
13Nd5Bd8
14Nb6Rb8
15Ng5Bf5
16Bf3e4
17Be2h6
18Nh3Bxh3
19gxh3O-O
20Nd7Nxd7
21Rxd7Bf6
22Bf4Rfd8
23Rad1Be5
24Bxe5Rxd7
25Rxd7Nxe5
26Re7Nf3+
27Kg2f5
28Bxf3exf3+
29Kxf3Rc8
30b3b5
31cxb5axb5
32Rb7Ra8
33Rxb5Rxa2
34Rxf5Rb2
35Rb5Kf7
36Rb7+Kf6
37h4g5
38h5Ke5
39Rb6Kd5
40Rxh6Rxb3+
41Kg4Rb2
Matches to:
e4d5
exd5
Nf6
d4
Qxd5
c4
Qd6
Nf3c5
Result is supposed to be (the code adds a period after the move # but not necessary):
1e4 d5
2exd5 Nf6
3d4 Qxd5
4c4 Qd6
5Nf3 c5
6Be3 cxd4
7Nxd4 a6
8Be2 e5
9Nf3 Nc6
10O-O Qxd1
11Rxd1 Be7
12Nc3 Be6
13Nd5 Bd8
14Nb6 Rb8
15Ng5 Bf5
16Bf3 e4
17Be2 h6
18Nh3 Bxh3
19gxh3 O-O
20Nd7 Nxd7
21Rxd7 Bf6
22Bf4 Rfd8
23Rad1 Be5
24Bxe5 Rxd7
25Rxd7 Nxe5
26Re7 Nf3+
27Kg2 f5
28Bxf3 exf3+
29Kxf3 Rc8
30b3 b5
31cxb5 axb5
32Rb7 Ra8
33Rxb5 Rxa2
34Rxf5 Rb2
35Rb5 Kf7
36Rb7+ Kf6
37h4 g5
38h5 Ke5
39Rb6 Kd5
40Rxh6 Rxb3+
41Kg4 Rb2
Notice that the first and fifth moves are wrong, since it matches both white's and black's move.
What is the modification to my regex to get it to work so that it always just matches one sides move?
Here is the code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
namespace ChessPGNParserConsoleApplication
{
class Program
{
static void Main(string[] args)
{
string regexStr = @"(?:[PNBRQK]|[a-h][1-8]?x)?[a-h][1-8]|(O(-?O){1,2})[\+#]?(\s*[\!\?]+)?";
//string regexStr = @"(?:[PNBRQK]|[a-h][1-8]?x)?[a-h][1-8](?:\=[PNBRQK])?(O(-?O){1,2})[\+#]?(\s*[\!\?]+)?";
//string regexStr = @"(?:[PNBRQK]?[a-h]?[1-8]?x?[a-h][1-8](?:\=[PNBRQK])?|O(-?O){1,2})[\+#]?(\s*[\!\?]+)?";
//string regexStr = @"(?:[PNBRQK]?[a-h]?[1-8]?x?[a-h][1-8](?:\=[PNBRQK])?|O(-?O){1,2})[\+#]?(\s*[\!\?]+)?";
//string regexStr = @"(?:[PNBRQK]?[a-h]?[1-8]?x?[a-h][1-8])";
string startsDigitRegexStr = @"^\d*";
Regex regexpr = new Regex(regexStr);
Regex regexprDigit = new Regex(startsDigitRegexStr);
// Read the file and display it line by line.
System.IO.StreamReader file = new System.IO.StreamReader(@"C:\Users\idf\Documents\My Chess Database\chessgame.txt");
string replacement = "";
int moveNumber = 1;
string line;
while (null != (line = file.ReadLine()))
{
MatchCollection mcDigit = regexprDigit.Matches(line);
foreach (Match m in mcDigit)
{
line = regexprDigit.Replace(line, replacement);
//Console.WriteLine(m);
}
//Console.WriteLine(line);
MatchCollection mc = regexpr.Matches(line);
int twoMoves = 0;
Console.Write(moveNumber.ToString() + ". ");
foreach (Match m in mc)
{
Console.Write(m + " ");
if(1 == twoMoves++)
Console.WriteLine();
}
moveNumber++;
}
Console.ReadLine();
}
}
}