Author Topic: sefun for fuzzy matching strings  (Read 1828 times)

Offline Tricky

  • BFF
  • ***
  • Posts: 189
  • I like what I code and I code what I like!
    • View Profile
sefun for fuzzy matching strings
« on: November 21, 2006, 12:27:37 PM »
This was originally an efun within LPC4. I liked it and changed it some years ago into Perl. Then 4 months ago I recreated it in LPC.



Mudlib: Dead Souls 2.1.1

Driver: MudOS v22.2b14



/secure/sefun/fuzzymatch.c

LPC code:

/*    /secure/sefun/fuzzymatch.c
 *    sefuns for fuzzy string matching
 *    created by Tricky @ Rock the Halo, 21st November, 2006
 */

/* Fuzzymatch code:
 * original code by Stig S�ther Bakken (Auronthas) 940715
 * Less original code: Profezzorn
 * Perlified: Richard Harrison (Tricky)
 * LPC'd: Richard Harrison (Tricky) - 2006-07-20
 */

/* The fuzziness between two strings, STRING1 and STRING2, is calculated by
 * finding the position in STRING2 of a prefix of STRING1.  The first character
 * of STRING1 is found in STRING2.  If we find it, we continue matching
 * successive characters from STRING1 at successive STRING2 positions.  When we
 * have found the longest prefix of STRING1 in STRING2, we decide whether it is
 * a match.  It is considered a match if the length of the prefix is greater or
 * equal to the offset of the beginning of the prefix of STRING1 in STRING2.
 * This means that "food" will match "barfoo" because "foo" (the prefix)
 * matches "foo" in "barfoo" with an offset and length of 3.  However, "food"
 * will not be considered to match "barfu", since the length is 1 while the
 * offset is 3.  The fuzz value of the match is the length of the prefix.  If
 * we find a match, we take the prefix off STRING1 and the string upto the end
 * of the match in STRING2.  If we do not find a match, we take off the first
 * character in STRING1.  Then we try and find the next prefix.
 *
 * So, to walk through an example:
 *
 * (FM-matchiness "pigface" "pigsfly"):
 *
 * STRING1        STRING2         MATCH LENGTH  OFFSET  FUZZ
 * pigface        pigsfly         3             0       3
 * face           sfly            1             1       1
 * ace            ly              0             0       0
 * ce             ly              0             0       0
 * c              ly              0             0       0
 *
 *      => 4 or 57.14%
 *
 * (FM-matchiness "begining-of-l" "beginning-of-l"):
 *
 * STRING1        STRING2         MATCH LENGTH  OFFSET  FUZZ
 * begining-of-l  beginning-of-l  5             0       5
 * ing-of-l       ning-of-l       8             1       8
 *
 *      => 13 or 92.59%
 */

static int f_fuzzymatch(string str1, string str2)
{
    int fuzz = 0;

    while(str1 != "" && str2 != "")
    {
        /* Now we will look for the first character of tmp1 in tmp2 */
        int offset = strsrch(str2, str1[0..0]);
        int str1len = strlen(str1);

        if(offset != -1 && str1len >= offset)
        {
            string *str1tmp = explode(str1, "");
            string *str2tmp = explode(str2[offset..<1], "");
            int len = 0;
            int str2len = strlen(str2[offset..<1]);

            /* Ok, so we have found one character, let's check how many more */
            while(str1tmp[len] == str2tmp[len])
            {
                len++;

                if(len >= str1len || len >= str2len)
                    break;

            }

            /* We consider this a hit only if the length of the match is */
            /* bigger than or equal to the offset */
            if(len >= offset)
            {
                fuzz += len;
                str1 = str1[len..<1];
                str2 = str2[len + offset..<1];

                continue;
            }

        }

        str1 = str1[1..<1];
    }

    return fuzz;
}

/* This is how we use it. */
float fuzzymatch(string str1, string str2)
{
    float fuzz;
    int l1 = strlen(str1);
    int l2 = strlen(str2);
    int t1 = l1 >= l2 ? l1 : l2;

    /* Pad out the shortest string to the length of the longest string */
    str1 = sprintf("%-*s", t1, str1);
    str2 = sprintf("%-*s", t1, str2);

    if(l1 == l2 && str1 == str2)
        fuzz = 100.0;
    else
    {
        fuzz = f_fuzzymatch(str1, str2);
        fuzz += f_fuzzymatch(str2, str1);
        fuzz = fuzz * 100.0 / (float)(l1 + l2);
    }

    return fuzz;
}




/doc/sefun/fuzzymatch

doc code:


FUZZYMATCH(1)                                       FUZZYMATCH(1)

NAME
       fuzzymatch() - returns the fuzziness between two strings.

SYNOPSIS
       float fuzzymatch(string str1, string str2)

DESCRIPTION
       The fuzziness between two strings, STRING1 and STRING2, is
       calculated by finding the position in STRING2 of a prefix
       of STRING1.  The first character of STRING1 is found in
       STRING2.  If we find it, we continue matching successive
       characters from STRING1 at successive STRING2 positions.
       When we have found the longest prefix of STRING1 in
       STRING2, we decide whether it is a match.  It is
       considered a match if the length of the prefix is greater
       or equal to the offset of the beginning of the prefix of
       STRING1 in STRING2.  This means that "food" will match
       "barfoo" because "foo" (the prefix) matches "foo" in
       "barfoo" with an offset and length of 3.  However, "food"
       will not be considered to match "barfu", since the length
       is 1 while the offset is 3.  The fuzz value of the match
       is the length of the prefix.  If we find a match, we take
       the prefix off STRING1 and the string upto the end of the
       match in STRING2.  If we do not find a match, we take off
       the first character in STRING1.  Then we try and find the
       next prefix.

       So, to walk through an example:

       (FM-matchiness "pigface" "pigsfly"):

       STRING1        STRING2         MATCH LENGTH  OFFSET  FUZZ
       pigface        pigsfly         3             0       3
       face           sfly            1             1       1
       ace            ly              0             0       0
       ce             ly              0             0       0
       c              ly              0             0       0

            => 4 or 57.14%

       (FM-matchiness "begining-of-l" "beginning-of-l"):

       STRING1        STRING2         MATCH LENGTH  OFFSET  FUZZ
       begining-of-l  beginning-of-l  5             0       5
       ing-of-l       ning-of-l       8             1       8

            => 13 or 92.59%

EXAMPLES
       fuzz = fuzzymatch("pigface", "pigsfly")
       fuzz = fuzzymatch("begining-of-l", "beginning-of-l")

LOCATION
       /secure/sefun/fuzzymatch.c

AUTHOR
       Tricky @ Rock the Halo

                                                                1


I haven't checked it, but it should work fine. (Refer to a more knowledgeble person on how to integrate it into the sefun system.)



Tricky