Finding Similar Strings (LCS + Levenshtein Distance)

Consider an algorithmic problem:

The task

Given a set of strings $$S_1, S_2, …, S_n$$ (in the general case of different lengths) consisting of lowercase Latin letters.

Given a string T (user request), also consisting of lowercase Latin letters.

Each line $Si(i=1,…,n)$ contains at most k words.

The string $T$ also contains at most $k$ words.

Words are separated by spaces.

Need to find “most similar” to the user’s request a string from the set $S_1, S_2, …, S_n$.

We will take into account that:

  1. The user could swap the words.

  2. The user could enter only a part (subsequence) of the string that he wanted to find in the set.

  3. The user could make typos: skipping letters, writing extra letters, replacing letters with others.

Solution

Let’s consider the simplest solution.

Let’s go through all the permutations of the query words T.

For each permutation of words $T_permutation$, let’s iterate over all rows of $S_i$, and run the algorithm $LCS(T_permutation,S_i)$.

LCS – longest common subsequence.

From all rows S_1,S_2,…,S_n, we select those rows for which |LCS(T_permutation,S_i)| – maximum. Also remember for which permutation $T_permutation(saved)$ this maximum LCS value was obtained. Denote the resulting set of strings as $L$.

Now let’s calculate the Levenshtein distance between the corresponding $L_i$ and $T_permutation(saved)$.

We choose the only row $L_ans$ for which the Levenshtein distance with the corresponding $T_permutation(saved)$ is minimal. This line will be the answer.

Java code:

public int F(char x, char y)
{
    if (x == y) return 0;
    else return 1;
}

public int GetDist(String a, String b)
{
    int[][] D = new int[100][100];
    D[0][0] = 0;
    for (int i = 1; i < a.length(); i++)
    {
        D[i][0] = i;
    }
    for (int j = 1; j < b.length(); j++)
    {
        D[0][j] = j;
    }
    for (int i = 1; i < a.length(); i++)
    {
        for (int j = 1; j < b.length(); j++)
        {
            int m = F(a.charAt(i), b.charAt(j));
            int val1 = D[i][j-1] + 1;
            int val2 = D[i-1][j] + 1;
            int val3 = D[i-1][j-1] + m;

            int minv = Math.min(val1, val2);
            minv = Math.min(minv, val3);
            D[i][j] = minv;
        }
    }
    return D[a.length()-1][b.length()-1];
}



public String[] swap(String data[], int left, int right)
{
    // Swap the data
    String temp = data[left];
    data[left] = data[right];
    data[right] = temp;

    // Return the updated array
    return data;
}


public int Compare(String a, String b)
{
    for (int i = 0; i < Math.min(a.length(), b.length()); i++)
    {
        if (a.charAt(i) < b.charAt(i))
        {
            return -1;
        }
        if (a.charAt(i) > b.charAt(i))
        {
            return 1;
        }
    }
    if (a.length() < b.length()) return -1;
    if (a.length() > b.length()) return 1;
    return 0;
}

public String[] findNextPermutation(String p[])
{
    for (int a = p.length - 2; a >= 0; --a)
    {
        if (Compare(p[a], p[a+1]) == -1)
        {
            for (int b = p.length-1; ; --b)
            {
                if (Compare(p[b], p[a]) == 1)
                {
                    String t = p[a];
                    p[a] = p[b];
                    p[b] = t;
                    for (++a, b = p.length-1; a < b; ++a, --b)
                    {
                        t = p[a];
                        p[a] = p[b];
                        p[b] = t;
                    }
                    return p;
                }

            }
        }
    }
    return p;
}

public int Factorial(int n)
{
    int fact = 1;
    for (int i = 2; i <= n; i++)
    {
        fact *= i;
    }
    return fact;
}

public int GetLenMaxSubSeq(String a, String b)
{
    int[][] dp = new int[100][100];

    dp[0][0] = 0;
    for (int i = 1; i <= a.length(); i++)
    {
        dp[i][0] = 0;
    }

    for (int j = 1; j <= b.length(); j++)
    {
        dp[0][j] = 0;
    }

    for (int i = 1; i <= a.length(); i++)
    {
        for (int j = 1; j <= b.length(); j++)
        {
            if (a.charAt(i-1) == b.charAt(j-1))
            {
                dp[i][j] = dp[i-1][j-1] + 1;
            }
            else dp[i][j] = Math.max(dp[i-1][j], dp[i][j-1]);
        }
    }

    int maxv = 0;
    for (int i = 0; i <= a.length(); i++)
    {
        for (int j = 0; j <= b.length(); j++)
        {
            if (dp[i][j] > maxv)
            {
                maxv = dp[i][j];
            }
        }
    }
    return maxv;
}

public Pair PermutateSubSeq(String a, String b)
{
    int max_len = 0;

    String[] b_arr = b.split("\\s+");
    Arrays.sort(b_arr);

    String res = "";

    for (int cur_permutation = 0; cur_permutation < Factorial(b_arr.length); cur_permutation++)
    {
        int cur_len = GetLenMaxSubSeq(a, b);

        if (cur_len > max_len)
        {
            max_len = cur_len;
            res = b;
        }

        b_arr = findNextPermutation(b_arr);
        b = "";
        for (int i = 0; i < b_arr.length-1; i++)
        {
            b += b_arr[i] + " ";
        }
        b += b_arr[b_arr.length-1];
    }

    Pair pair = new Pair();
    pair.Val = max_len;
    pair.str = res;

    return pair;
}

public int GetIndex(String name, int N)
{
    int index = 0;
    int maxlensubseq = 0;
    int min_dist = 10000;

    for (int i = 0; i < N; i++)
    {
        Pair p = PermutateSubSeq(arr[i], name);
        int cur_subseq_len = p.Val;
        String str = p.str;

        if (cur_subseq_len > maxlensubseq)
        {
            maxlensubseq = cur_subseq_len;
            index = i;
        }

        if (cur_subseq_len == maxlensubseq)
        {
            int cur_dist = GetDist(arr[i], str);
            if (cur_dist < min_dist)
            {
                min_dist = cur_dist;
                index = i;
            }
        }
    }
    return index;
}

Similar Posts

Leave a Reply

Your email address will not be published. Required fields are marked *