5

I am beginning Erlang and as an exercise I tried to implement the CYK algorithm.

Main code(cyk.erl):

%%% An attempt for a CYK parser in Erlang

-module(cyk).

-export([
         init_storage/0,
         import_grammar_file/1,
         add_grammar_rule/1,
         analyze/1,
         test_analyze/0
        ]).

%% Initialize the ets storage for grammar
init_storage() ->
  ets:new(?MODULE, [bag, named_table]).

%%------------------------------------------
%% 
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
  {ok, Device} = file:open(File, read),
  import_file_rules(Device).

%% Import all the rules in the file
import_file_rules(Device) ->
  case io:get_line(Device, "") of
    eof ->
      io:format("Grammar file imported~n"),
      file:close(Device);
    Line ->
      add_grammar_rule(Line),
      import_file_rules(Device)
  end.

%% Add a grammar rule
add_grammar_rule(Rule) ->
  case re:run(Rule, "^([^\s]+)\s?->\s?([^\n]+)$", [{capture, all_but_first, binary}]) of
    {match, [A, B]} ->
      ets:insert(?MODULE, {A, B}),
      io:format("parsing ~p -> ~p~n", [A, B]);
    nomatch ->
      io:format("cannot parse ~p~n", [Rule])
  end.

%%------------------------------------------
%% 
%% Main logic
%%
%%------------------------------------------

%% Analyze a sentence
analyze(Sentence) ->
  io:format("analysing: ~p~n", [Sentence]),
  WordList = re:split(Sentence, " "),
  io:format("wordlist: ~p~n", [WordList]),
  Representation = lists:map( fun(Word) -> associate(Word) end, WordList),
  io:format("representation: ~p~n", [Representation]),
  Result = process([Representation]),
  io:format("result: ~p~n", [Result]).

% associate sentence words with grammar terms
associate(Word) ->
  case ets:match(cyk, {'$1', Word}) of
    [H|T] -> lists:flatten([H|T]);
    [] -> []
  end.

% process sentence representation
process(Representation) ->
  Limit = length(lists:last(Representation)),
  process(Representation, Limit).

process(Representation, Limit) when Limit > 1 ->
  NextStep = process(Representation, 1, Limit-1, []),
  process([NextStep|Representation], Limit-1);
process(Representation, _Limit) ->
  Representation.

process(Representation, Index, Limit, Acc) when Index =< Limit ->
  Subtree = extract_subtree(lists:reverse(Representation), Index),
  Result = process_subtree(Subtree),
  process(Representation, Index+1, Limit, [Result|Acc]);
process(_Representation, _Index, _Limit, Acc) ->
  lists:reverse(Acc).

%%------------------------------------------
%% 
%% Subtree
%%
%%------------------------------------------

process_subtree(Subtree) ->
  process_subtree(Subtree, Subtree, [], 1).

process_subtree([], _Subtree, Acc, _Index) ->
  Acc;
process_subtree([H|T], Subtree, Acc, Index) ->
  A = lists:nth(1,H),
  Bind = length( Subtree ) - Index + 1,
  B = lists:last( lists:nth( Bind, Subtree) ),
  % generating the possibilities of grammar
  Pos = [ list_to_binary(binary:bin_to_list(X)++" "++binary:bin_to_list(Y)) || X<-A, Y<-B ],
  % looking up in the grammar
  Result = lists:flatten( [ ets:match(cyk, {'$1', X}) || X <- Pos ] ),
  process_subtree(T, Subtree, Acc++Result, Index + 1).

%% Extract a subtree from the representation 
extract_subtree(Representation, Position) ->
  Size = length(Representation) + 1,
  extract_subtree(Representation, Size, Position, []).

extract_subtree([], _Size, _Position, Acc) ->
  lists:reverse(Acc);
extract_subtree([H|T], Size, Position, Acc) ->
  Segment = lists:sublist(H, Position, Size),
  extract_subtree(T, Size - 1, Position, [Segment|Acc]).

%%------------------------------------------
%% 
%% Test
%% using the same example as 
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
test_analyze() ->
  init_storage(),
  import_grammar_file("grammar.txt"),
  analyze("she eats a fish with a fork").

The grammar file (grammar.txt)

S -> NP VP
VP -> VP PP
VP -> V NP
VP -> eats
PP -> P NP
NP -> Det N
NP -> she 
V -> eats
P -> with
N -> fish
N -> fork
Det -> a

The code can be tested from the erlang shell

> c(cyk).
> cyk:test_analyze().
parsing <<"S">> -> <<"NP VP">>
parsing <<"VP">> -> <<"VP PP">>
parsing <<"VP">> -> <<"V NP">>
parsing <<"VP">> -> <<"eats">>
parsing <<"PP">> -> <<"P NP">>
parsing <<"NP">> -> <<"Det N">>
parsing <<"NP">> -> <<"she">>
parsing <<"V">> -> <<"eats">>
parsing <<"P">> -> <<"with">>
parsing <<"N">> -> <<"fish">>
parsing <<"N">> -> <<"fork">>
parsing <<"Det">> -> <<"a">>
Grammar file imported
analysing: "she eats a fish with a fork"
wordlist: [<<"she">>,<<"eats">>,<<"a">>,<<"fish">>,<<"with">>,<<"a">>,
           <<"fork">>]
representation: [[<<"NP">>],
                 [<<"VP">>,<<"V">>],
                 [<<"Det">>],
                 [<<"N">>],
                 [<<"P">>],
                 [<<"Det">>],
                 [<<"N">>]]
result: [[[<<"S">>]],
         [[],[<<"VP">>]],
         [[],[],[]],
         [[<<"S">>],[],[],[]],
         [[],[<<"VP">>],[],[],[<<"PP">>]],
         [[<<"S">>],[],[<<"NP">>],[],[],[<<"NP">>]],
         [[<<"NP">>],
          [<<"VP">>,<<"V">>],
          [<<"Det">>],
          [<<"N">>],
          [<<"P">>],
          [<<"Det">>],
          [<<"N">>]]]

The code seems to work fine for this example, but I was looking for ways to improve it (make it more erlang-ish) and specially to make the processing distributed on multiple process/nodes.

I guess all the process_subtree executions for each step could be done concurrent, but I can't really figure how.

Any suggestions will be greatly appreciated!

Eric
  • 2,784
  • 1
  • 20
  • 25
  • except the fact that it didn't crash, looking at the result, why should I think it seems to work? :o) – Pascal Apr 30 '14 at 19:20
  • Readability is not very good, but the result represents the CYK matrix table. As the values computed are the same with http://en.wikipedia.org/wiki/CYK_algorithm#Example the processing seems correct. (at least for the tested case) – Eric Apr 30 '14 at 22:37
  • Sorry for the joke ... I can't resit. I read the wikipedia example and yes, it is clear... I am thinking in background how to answer your question, if I have some time I'll propose you something, although I am not sure that it will be more efficient. – Pascal May 01 '14 at 06:58
  • I posted a proposal, maybe not you are asking for... If you prefer searching by yourself, the idea is to launch in parallel as many processes that there are words in the sentence, and let them launch new ones for the next step of analysis. – Pascal May 02 '14 at 13:21
  • I have edited the answer, the wait function was really annoying me. – Pascal May 05 '14 at 12:16

1 Answers1

4

I have written this solution which use concurrent execution.

Compare to Eric solution, some changes were needed for the usage of multi-processes, some other because I think it is more efficient (I reverted keys and values in the rules ets, and I have chosen a set), some because I think it is cleaner (I close the grammar file in the function that open it) and some because I am more familiar with these modules (string:tokens ...).

[edit]

I have replaced a useless spawn by faster recursive call, and suppressed the wait function by adding a message to synchronize the processes.

I got the idea of this implementation looking at the nice animation at a Javascript animation of the CYK algorithm, which is unfortunately no longer available.

@Eric, it is possible to look at all steps of the analysis opening the ets analyze with observer, it is why I do not delete it.

-module(cyk).

-export([
         import_grammar_file/1,
         add_grammar_rule/2,
         analyze/1,
         test_analyze/1,
         test_analyze/0
        ]).

%%------------------------------------------
%% 
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
  reset_ets(rules, ets:info(rules)),
  {ok, Device} = file:open(File, read),
  ok = add_grammar_rule(Device,file:read_line(Device)),
  file:close(Device),
  io:format("Grammar file imported~n").

%% Add a grammar rule
add_grammar_rule(_,eof) -> ok;
add_grammar_rule(Device,{ok,Rule}) ->
  [T,"->",H|Q] = string:tokens(Rule," \n"),
  Key = key(H,Q),
  insert(Key,T,ets:lookup(rules, Key)),  
  add_grammar_rule(Device,file:read_line(Device)).

key(H,[]) -> H;
key(H,[Q]) -> {H,Q}.

insert(Key,T,[]) -> ets:insert(rules, {Key,[T]});
insert(Key,T,[{Key,L}]) -> ets:insert(rules, {Key,[T|L]}).


%%------------------------------------------
%% 
%% Main logic
%%
%%------------------------------------------

%% Analyze a sentence
analyze(Sentence) ->
  reset_ets(analyze, ets:info(analyze)),
  io:format("analysing: ~p~n", [Sentence]),
  WordList = string:tokens(Sentence, " "),
  Len = length(WordList),
  Me = self(),
  lists:foldl(fun(X,{J,Pid}) -> ets:insert(analyze,{{0,J},ets:lookup_element(rules,X,2)}),
                          (NewPid = spawn(fun() -> whatis(1,J,Len,Pid,Me) end)) ! {done,0},
                          {J+1,NewPid} end,
                        {1,none}, WordList),
  receive
    M -> M
  end.

reset_ets(Name, undefined) -> ets:new(Name,[set, named_table,public]);
reset_ets(Name, _) -> ets:delete_all_objects(Name).

whatis(Len,1,Len,_,PidRet) -> PidRet ! ets:lookup_element(analyze,{Len-1,1},2); % finished
whatis(I,J,Len,_,_) when I + J == Len +1 -> ok; % ends useless processes
whatis(I,J,Len,Pid,PidRet) ->
  receive {done,V} when V == I-1 -> ok end,
  Cases = lists:map(fun({X,Y}) -> [{A,B} || A <- ets:lookup_element(analyze,X,2), 
                                            B <- ets:lookup_element(analyze,Y,2)] end,
                         [{{X-1,J},{I-X,J+X}} || X <- lists:seq(1,I)]),
  Val = lists:foldl(fun(X,Acc) -> case ets:lookup(rules,X) of
                                      [] -> Acc;
                                      [{_,[R]}] -> [R|Acc]
                                      end end,
                                      [],lists:flatten(Cases)),
  ets:insert(analyze,{{I,J},Val}),
  send(Pid,I),
  whatis(I+1,J,Len,Pid,PidRet).
  
send(none,_) -> ok;
send(Pid,I) -> Pid ! {done,I}.

%%------------------------------------------
%% 
%% Test
%% using the same example as 
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------
test_analyze(S) ->
  import_grammar_file("grammar.txt"),
  analyze(S).

test_analyze() ->
  test_analyze("she eats a fish with a fork").
rici
  • 234,347
  • 28
  • 237
  • 341
Pascal
  • 13,977
  • 2
  • 24
  • 32