Search This Blog

Wednesday, March 19, 2014

Fuzzy Word Comparision in MySQL

As the topic has recently come up in a Usenet post, I've rummaged through my MySQL playground database and dug out an implementation of the Levenshtein distance algorithm as a MySQL stored function.

Levenshtein Distance

 The Levenshtein distance is a measure for how much two words differ as a sum of changes to change one into the other. This means a distance of zero means that the two words are identical, and every character that needs to be added, removed or changed increases the difference by one. The first of the stored functions below, named LSD (which should by no means be taken as an endorsement to take drugs) returns the distance as an integer. The code is an adaption of the optimized iterative approach that can be found in the Wikipedia article on the Levenshtein algorithm.

Usage example

SELECT LSD('kitchen', 'kitten');
>> 2

To get from 'kitchen' to 'kitten', one needs two steps:
  • change the 'c' to 't'
  • remove the 'h'

Levenshtein Ratio

As a distance of 2 may have completely different meanings for long words of fifteen characters opposed two comparing three-character words, the distance on its own doesn't tell enough about the similarity of two words. That's what the LSD_RATIO function below is for, it calculates the percentual Levenshtein distance compared to the maximum word length.

SELECT LSD_RATIO('bad', 'bed');
>> 67

SELECT LSD_RATIO('beast', 'feast');
>> 80

MySQL DDL

DROP FUNCTION IF EXISTS LSD;

DELIMITER $$

CREATE FUNCTION LSD(inParamA VARCHAR(254), inParamB VARCHAR(254))
RETURNS INT
DETERMINISTIC
NO SQL
BEGIN
    DECLARE inpA VARCHAR(254);
    DECLARE inpB VARCHAR(254);
    DECLARE i INT DEFAULT 1;
    DECLARE j INT DEFAULT 1;
    DECLARE curMin INT DEFAULT 0;
    DECLARE cost INT DEFAULT 0;
    DECLARE v0 CHAR(64) DEFAULT REPEAT(CHAR(0), 255);
    DECLARE v1 CHAR(64) DEFAULT REPEAT(CHAR(0), 255);
    DECLARE aLen INT DEFAULT 0;
    DECLARE bLen INT DEFAULT 0;
   
    IF inParamA IS NULL OR inParamB IS NULL THEN
        RETURN NULL;
    END IF;
   
    IF inParamA = inParamB THEN
        RETURN 0;
    END IF;
   
    IF LENGTH(inParamA) = 0 THEN
        RETURN LENGTH(inParamB);
    END IF;
   
    IF LENGTH(inParamB) = 0 THEN
        RETURN LENGTH(inParamA);
    END IF;

    IF LENGTH(inParamA) < LENGTH(inParamB) THEN
        SET inpA = inParamA;
        SET inpB = inParamB;
    ELSE
        SET inpA = inParamB;
        SET inpB = inParamA;
    END IF;
   
    SET aLen = LENGTH(inpA);
    SET bLen = LENGTH(inpB);
   
    WHILE i <= bLen DO
        SET v0 = INSERT(v0, i, 1, CHAR(i-1));
        SET i = i + 1;
    END WHILE;
   
    SET i = 1;
    WHILE i <= aLen DO
        SET v1 = INSERT(v1, 1, CHAR(i), 1);
       
        SET j = 1;
       
        WHILE j <= bLen DO
            IF SUBSTRING(inpA, i, 1) = SUBSTRING(inpB, j, 1) THEN
                SET cost = 0;
            ELSE
                SET cost = 1;
            END IF;
            SET curMin = ORD(SUBSTRING(v1,j,1)) + 1;
            IF ORD(SUBSTRING(v0,j + 1,1)) + 1 < curMin THEN
                SET curMin = ORD(SUBSTRING(v0,j,1))  + 1;
            END IF;
            IF ORD(SUBSTRING(v0,j,1)) + cost < curMin THEN
                SET curMin = ORD(SUBSTRING(v0,j,1)) + cost;
            END IF;
            SET v1 = INSERT(v1, j + 1, 1, CHAR(curMin));
            SET j = j + 1;
        END WHILE;
       
        SET v0 = v1;
       
        SET i = i + 1;
    END WHILE;
   
    RETURN ORD(SUBSTRING(v1,bLen+1,1));
END
$$

DROP FUNCTION IF EXISTS LSD_RATIO
$$

CREATE FUNCTION LSD_RATIO(inParamA VARCHAR(254), inParamB VARCHAR(254))
RETURNS INT
DETERMINISTIC
NO SQL
BEGIN
    DECLARE maxLen INT DEFAULT 0;
    DECLARE lvDist INT;

    IF inParamA IS NULL OR inParamB IS NULL THEN
        return 0;
    END IF;

    SET maxLen = (CASE WHEN LENGTH(inParamA) > LENGTH(inParamB) THEN LENGTH(inParamA) ELSE LENGTH(inParamB) END);
    IF maxLen = 0 THEN
        return 0;
    SET lvDist = LSD(inParamA, inParamB);
    RETURN (maxLen - lvDist) / maxLen * 100;
END
$$

No comments: