![]() |
|
|||
0001 // SPDX-License-Identifier: GPL-2.0 0002 #include "levenshtein.h" 0003 #include <errno.h> 0004 #include <stdlib.h> 0005 #include <string.h> 0006 0007 /* 0008 * This function implements the Damerau-Levenshtein algorithm to 0009 * calculate a distance between strings. 0010 * 0011 * Basically, it says how many letters need to be swapped, substituted, 0012 * deleted from, or added to string1, at least, to get string2. 0013 * 0014 * The idea is to build a distance matrix for the substrings of both 0015 * strings. To avoid a large space complexity, only the last three rows 0016 * are kept in memory (if swaps had the same or higher cost as one deletion 0017 * plus one insertion, only two rows would be needed). 0018 * 0019 * At any stage, "i + 1" denotes the length of the current substring of 0020 * string1 that the distance is calculated for. 0021 * 0022 * row2 holds the current row, row1 the previous row (i.e. for the substring 0023 * of string1 of length "i"), and row0 the row before that. 0024 * 0025 * In other words, at the start of the big loop, row2[j + 1] contains the 0026 * Damerau-Levenshtein distance between the substring of string1 of length 0027 * "i" and the substring of string2 of length "j + 1". 0028 * 0029 * All the big loop does is determine the partial minimum-cost paths. 0030 * 0031 * It does so by calculating the costs of the path ending in characters 0032 * i (in string1) and j (in string2), respectively, given that the last 0033 * operation is a substitution, a swap, a deletion, or an insertion. 0034 * 0035 * This implementation allows the costs to be weighted: 0036 * 0037 * - w (as in "sWap") 0038 * - s (as in "Substitution") 0039 * - a (for insertion, AKA "Add") 0040 * - d (as in "Deletion") 0041 * 0042 * Note that this algorithm calculates a distance _iff_ d == a. 0043 */ 0044 int levenshtein(const char *string1, const char *string2, 0045 int w, int s, int a, int d) 0046 { 0047 int len1 = strlen(string1), len2 = strlen(string2); 0048 int *row0 = malloc(sizeof(int) * (len2 + 1)); 0049 int *row1 = malloc(sizeof(int) * (len2 + 1)); 0050 int *row2 = malloc(sizeof(int) * (len2 + 1)); 0051 int i, j; 0052 0053 for (j = 0; j <= len2; j++) 0054 row1[j] = j * a; 0055 for (i = 0; i < len1; i++) { 0056 int *dummy; 0057 0058 row2[0] = (i + 1) * d; 0059 for (j = 0; j < len2; j++) { 0060 /* substitution */ 0061 row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); 0062 /* swap */ 0063 if (i > 0 && j > 0 && string1[i - 1] == string2[j] && 0064 string1[i] == string2[j - 1] && 0065 row2[j + 1] > row0[j - 1] + w) 0066 row2[j + 1] = row0[j - 1] + w; 0067 /* deletion */ 0068 if (row2[j + 1] > row1[j + 1] + d) 0069 row2[j + 1] = row1[j + 1] + d; 0070 /* insertion */ 0071 if (row2[j + 1] > row2[j] + a) 0072 row2[j + 1] = row2[j] + a; 0073 } 0074 0075 dummy = row0; 0076 row0 = row1; 0077 row1 = row2; 0078 row2 = dummy; 0079 } 0080 0081 i = row1[len2]; 0082 free(row0); 0083 free(row1); 0084 free(row2); 0085 0086 return i; 0087 }
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.1.0 LXR engine. The LXR team |
![]() ![]() |