1

OUnit.assert_equal ~pp_diff allows pretty-printing of expected/actual value differences and OUnitDiff seems to provide differs for collections.

Is there a stock pp_diff for string values though? Ideally one that makes a best-effort to expand diffs to the closest UTF-8 sequence boundaries.

Even common prefix/suffix elimination would be better than nothing.

Mike Samuel
  • 118,113
  • 30
  • 216
  • 245
  • 1
    You could probably take some elements from `patdiff`; Or create an edit-distance matrix and obtain a backtrace to get the edited strings if they are short. – nlucaroni Apr 08 '14 at 22:16

1 Answers1

3

An amusing morning challenge.

type move = Same | Add | Del

let edit_distance_matrix a b =
  (* The usual dynamic edit distance algorithm, except we keep
     a complete matrix of moves to be able to step back and see which
     operations can turn [sa] into [sb].

     This is not very efficient: we keep the complete matrices of
     distances (costs) and moves. One doesn't need to know the move
     for all cases of the matrix, only those that are on the "best"
     path from begin to end; it would be better to recompute the moves
     along the path after the facts. There probably also exists
     a classic clever trick to apply the usual optimization of keeping
     only two rows of the matrix at any time, and still compute the
     best path along the way. 
  *)
  let la, lb = String.length a, String.length b in
  let m = Array.make_matrix (la + 1) (lb + 1) (-1) in
  let moves = Array.make_matrix (la + 1) (lb + 1) Same in
  m.(0).(0) <- 0;
  for i = 1 to la do
    m.(i).(0) <- i;
  done;
  for j = 1 to lb do
    m.(0).(j) <- j;
  done;
  for i = 1 to la do
    for j = 1 to lb do
      let best, move =
        if a.[i-1] = b.[j-1] then m.(i-1).(j-1), Same
        else
          if m.(i-1).(j) <= m.(i).(j-1)
          then m.(i-1).(j) + 1, Del
          else m.(i).(j-1) + 1, Add
      in
      m.(i).(j) <- best;
      moves.(i).(j) <- move;
    done;
  done;
  m, moves

let get m (i, j) = m.(i).(j)

let valid m pos =
  fst pos >= 0 && snd pos >= 0

let previous (i, j) = function
  | Same -> (i - 1, j - 1)
  | Add -> (i, j - 1)
  | Del -> (i - 1, j)

let cons _pos action = function
  | (action', n) :: rest when action = action' ->
    (action', n+1) :: rest
  | list -> (action, 1) :: list

(** walk back along the "best path", taking notes of changes to make
    as we go *)
let chunks moves =
  let la = Array.length moves - 1 in
  let lb = Array.length moves.(0) - 1 in
  let start = (la, lb) in
  let rec loop acc pos =
    let move = get moves pos in
    let next_pos = previous pos move in
    (* if the next position is not valid,
       the current move is a dummy move,
       and it must not be returned as part of [acc] *)
    if not (valid moves next_pos) then acc
    else loop (cons pos move acc) next_pos
  in loop [] start

(** print the list of changes in term of the original string

    We skip large parts of the string that are common, keeping only
    [context] characters on the sides to provide some context.
*)
let diff context sa sb =
  let cost, moves = edit_distance_matrix sa sb in
  let chks = chunks moves in
  let buf = Buffer.create cost.(String.length sa).(String.length sb) in
  let rec loop i j = function
    | [] -> ()
    | (Same, n) :: rest ->
      if n <= 2 * context then
        Buffer.add_substring buf sa i n
      else begin
        Buffer.add_substring buf sa i context;
        Buffer.add_string buf "...\n...";
        Buffer.add_substring buf sa (i + n - context) context;
      end;
      loop (i + n) (j + n) rest
    | (Add, n) :: rest ->
      begin
        Buffer.add_string buf "[+";
        Buffer.add_substring buf sb j n;
        Buffer.add_char buf ']';
      end;
      loop i (j + n) rest 
    | (Del, n) :: rest ->
      begin
        Buffer.add_string buf "[-";
        Buffer.add_substring buf sa i n;
        Buffer.add_char buf ']';
      end;
      loop (i + n) j rest 
  in
  begin
    try loop 0 0 chks with _ -> ()
  end;
  Buffer.contents buf

Test:

# print_endline @@ diff 4
    "le gros chat mange beaucoup de croquettes au saumon"
    "le chat maigre mange peu de croquettes au saumon"
  ;;
le[- gros] chat[+ maigre] mange [+p][-b]e[-auco]u[-p] de ...
...umon
gasche
  • 31,259
  • 3
  • 78
  • 100
  • No, there is no way to keep the two rows AND get a back-trace. There are efficiencies using an algorithm we developed at my previous job --the data-structure is diagonally based instead of row--, but it wouldn't be helpful in tiny matrices like these due to the number of if statements necessary --it was for molecular alignments. – nlucaroni Apr 09 '14 at 15:39
  • What we could do is store a two-row, not of moves, but of paths (best paths so far): this is still `O(|a|*|b|)` in the worst-case, but we have a compression technique for paths that works relatively well in practice, so on actual inputs memory consumption should be much smaller. – gasche Apr 09 '14 at 17:02
  • Sure, I got that. Presumably then one wouldn't also store every prefix associated with the cells, but a reasonable subset? I would expect that to work well on reasonable input too. About the compression technique for paths, are you referring to a trie? – nlucaroni Apr 09 '14 at 17:51
  • Only the chunked representation that is used later in my code: compressing consecutive moves of the same nature. Thanks, by the way, for the information on the two-rows+paths case, I'll admit I never was an expert in dynamic programming. – gasche Apr 09 '14 at 17:59