String Distances

Revisiting String Distances

Back in April I wrote about three different string distance algorithms.  If you’re not familiar with string distance algorithms I encourage you to read the original post but in short, these algorithms provide different ways to determine how many steps are required to translate one string into another.  Since the algorithms I wrote about aren’t particularly complex I thought translating them to F# would be a fun exercise.

In general the code is pretty similar to its C# counterpart but I was able to take advantage of a few F# constructs, tuples and pattern matching in particular.  These examples may or may not be the best way to implement the algorithms in F# (ok, I’ll be honest, they’re probably not) but they certainly show how well suited F# is to this type of work.

Hamming Distance

The Hamming distance algorithm simply counts how many characters are different in equal length strings.

let hammingDistance (source : string) (target : string) =
  if source.Length <> target.Length then failwith "Strings must be equal length"
 
  Array.zip (source.ToCharArray()) (target.ToCharArray())
  |> Array.fold (fun acc (x, y) -> acc + (if x = y then 0 else 1)) 0

hammingDistance "abcde" "abcde" |> printfn "%i" // 0
hammingDistance "abcde" "abcdz" |> printfn "%i" // 1
hammingDistance "abcde" "abcyz" |> printfn "%i" // 2
hammingDistance "abcde" "abxyz" |> printfn "%i" // 3
hammingDistance "abcde" "awxyz" |> printfn "%i" // 4
hammingDistance "abcde" "vwxyz" |> printfn "%i" // 5

Levenshtein Distance

The Levenshtein distance algorithm counts how many characters are different between any two strings taking into account character insertions and deletions.

open System
let levenshteinDistance (source : string) (target : string) =
  let wordLengths = (source.Length, target.Length)
  let matrix = Array2D.create ((fst wordLengths) + 1) ((snd wordLengths) + 1) 0

  for c1 = 0 to (fst wordLengths) do
    for c2 = 0 to (snd wordLengths) do
      matrix.[c1, c2] <-
        match (c1, c2) with
        | h, 0 -> h
        | 0, w -> w
        | h, w ->
          let sourceChar, targetChar = source.[h - 1], target.[w - 1]
          let cost = if sourceChar = targetChar then 0 else 1
          let insertion = matrix.[h, w - 1] + 1
          let deletion = matrix.[h - 1, w] + 1
          let subst = matrix.[h - 1, w - 1] + cost
          Math.Min(insertion, Math.Min(deletion, subst))
  matrix.[fst wordLengths, snd wordLengths]

levenshteinDistance "abcde" "abcde" |> printfn "%i" // 0
levenshteinDistance "abcde" "abcdz" |> printfn "%i" // 1
levenshteinDistance "abcde" "abcyz" |> printfn "%i" // 2
levenshteinDistance "abcde" "abxyz" |> printfn "%i" // 3
levenshteinDistance "abcde" "awxyz" |> printfn "%i" // 4
levenshteinDistance "abcde" "vwxyz" |> printfn "%i" // 5
levenshteinDistance "abcdefg" "abcde" |> printfn "%i" // 2
levenshteinDistance "abcdefg" "zxyde" |> printfn "%i" // 5
levenshteinDistance "abcde" "bacde" |> printfn "%i" // 2
levenshteinDistance "abcde" "baced" |> printfn "%i" // 4

Damerau-Levenshtein Distance

The Damerau-Levenshtein algorithm builds upon the Levenshtein algorithm by taking transpositions into account.

open System
let damerauLevenshteinDistance (source : string) (target : string) =
  let wordLengths = (source.Length, target.Length)
  let matrix = Array2D.create ((fst wordLengths) + 1) ((snd wordLengths) + 1) 0

  for h = 0 to (fst wordLengths) do
    for w = 0 to (snd wordLengths) do
      matrix.[h, w] <-
        match (h, w) with
        | x, 0 -> x
        | 0, y -> y
        | x, y ->
          let sourceChar, targetChar = source.[x - 1], target.[y - 1]
          let cost = if sourceChar = targetChar then 0 else 1
          let insertion = matrix.[x, y - 1] + 1
          let deletion = matrix.[x - 1, y] + 1
          let subst = matrix.[x - 1, y - 1] + cost

          let distance = Math.Min(insertion, Math.Min(deletion, subst))

          if x > 1 && w > 1 && sourceChar = target.[y - 2] && source.[x - 2] = targetChar then
            Math.Min(distance, matrix.[x - 2, y - 2] + cost)
          else
            distance
  matrix.[fst wordLengths, snd wordLengths]

damerauLevenshteinDistance "abcde" "abcde" |> printfn "%i" // 0
damerauLevenshteinDistance "abcde" "abcdz" |> printfn "%i" // 1
damerauLevenshteinDistance "abcde" "abcyz" |> printfn "%i" // 2
damerauLevenshteinDistance "abcde" "abxyz" |> printfn "%i" // 3
damerauLevenshteinDistance "abcde" "awxyz" |> printfn "%i" // 4
damerauLevenshteinDistance "abcde" "vwxyz" |> printfn "%i" // 5
damerauLevenshteinDistance "abcdefg" "abcde" |> printfn "%i" // 2
damerauLevenshteinDistance "abcdefg" "zxyde" |> printfn "%i" // 5
// Transpositions
damerauLevenshteinDistance "abcde" "bacde" |> printfn "%i" // 1
damerauLevenshteinDistance "abcde" "baced" |> printfn "%i" // 2