Levenshtein Distance
The Levenshtein distance is a measure for how much two words differ as a sum of changes to change one into the other. This means a distance of zero means that the two words are identical, and every character that needs to be added, removed or changed increases the difference by one. The first of the stored functions below, named LSD (which should by no means be taken as an endorsement to take drugs) returns the distance as an integer. The code is an adaption of the optimized iterative approach that can be found in the Wikipedia article on the Levenshtein algorithm.Usage example
SELECT LSD('kitchen', 'kitten');>> 2
To get from 'kitchen' to 'kitten', one needs two steps:
- change the 'c' to 't'
- remove the 'h'
Levenshtein Ratio
As a distance of 2 may have completely different meanings for long words of fifteen characters opposed two comparing three-character words, the distance on its own doesn't tell enough about the similarity of two words. That's what the LSD_RATIO function below is for, it calculates the percentual Levenshtein distance compared to the maximum word length.SELECT LSD_RATIO('bad', 'bed');
>> 67
SELECT LSD_RATIO('beast', 'feast');
>> 80
MySQL DDL
DROP FUNCTION IF EXISTS LSD;DELIMITER $$
CREATE FUNCTION LSD(inParamA VARCHAR(254), inParamB VARCHAR(254))
RETURNS INT
DETERMINISTIC
NO SQL
BEGIN
DECLARE inpA VARCHAR(254);
DECLARE inpB VARCHAR(254);
DECLARE i INT DEFAULT 1;
DECLARE j INT DEFAULT 1;
DECLARE curMin INT DEFAULT 0;
DECLARE cost INT DEFAULT 0;
DECLARE v0 CHAR(64) DEFAULT REPEAT(CHAR(0), 255);
DECLARE v1 CHAR(64) DEFAULT REPEAT(CHAR(0), 255);
DECLARE aLen INT DEFAULT 0;
DECLARE bLen INT DEFAULT 0;
IF inParamA IS NULL OR inParamB IS NULL THEN
RETURN NULL;
END IF;
IF inParamA = inParamB THEN
RETURN 0;
END IF;
IF LENGTH(inParamA) = 0 THEN
RETURN LENGTH(inParamB);
END IF;
IF LENGTH(inParamB) = 0 THEN
RETURN LENGTH(inParamA);
END IF;
IF LENGTH(inParamA) < LENGTH(inParamB) THEN
SET inpA = inParamA;
SET inpB = inParamB;
ELSE
SET inpA = inParamB;
SET inpB = inParamA;
END IF;
SET aLen = LENGTH(inpA);
SET bLen = LENGTH(inpB);
WHILE i <= bLen DO
SET v0 = INSERT(v0, i, 1, CHAR(i-1));
SET i = i + 1;
END WHILE;
SET i = 1;
WHILE i <= aLen DO
SET v1 = INSERT(v1, 1, CHAR(i), 1);
SET j = 1;
WHILE j <= bLen DO
IF SUBSTRING(inpA, i, 1) = SUBSTRING(inpB, j, 1) THEN
SET cost = 0;
ELSE
SET cost = 1;
END IF;
SET curMin = ORD(SUBSTRING(v1,j,1)) + 1;
IF ORD(SUBSTRING(v0,j + 1,1)) + 1 < curMin THEN
SET curMin = ORD(SUBSTRING(v0,j,1)) + 1;
END IF;
IF ORD(SUBSTRING(v0,j,1)) + cost < curMin THEN
SET curMin = ORD(SUBSTRING(v0,j,1)) + cost;
END IF;
SET v1 = INSERT(v1, j + 1, 1, CHAR(curMin));
SET j = j + 1;
END WHILE;
SET v0 = v1;
SET i = i + 1;
END WHILE;
RETURN ORD(SUBSTRING(v1,bLen+1,1));
END
$$
DROP FUNCTION IF EXISTS LSD_RATIO
$$
CREATE FUNCTION LSD_RATIO(inParamA VARCHAR(254), inParamB VARCHAR(254))
RETURNS INT
DETERMINISTIC
NO SQL
BEGIN
DECLARE maxLen INT DEFAULT 0;
DECLARE lvDist INT;
IF inParamA IS NULL OR inParamB IS NULL THEN
return 0;
END IF;
SET maxLen = (CASE WHEN LENGTH(inParamA) > LENGTH(inParamB) THEN LENGTH(inParamA) ELSE LENGTH(inParamB) END);
IF maxLen = 0 THEN
return 0;
SET lvDist = LSD(inParamA, inParamB);
RETURN (maxLen - lvDist) / maxLen * 100;
END
$$
No comments:
Post a Comment