A handy collection of algorithms dealing with fuzzy strings and phonetics.
Consider this library deprecated for JavaScript.
Indeed, the Talisman library can be seen as an improvement over clj-fuzzy
and is, what’s more, written directly in JavaScript.
To install the latest Node.js version of the library, just run:
npm install clj-fuzzy
If you want the latest development version, run:
npm install git+https://github.com/Yomguithereal/clj-fuzzy.git
Finally, if you need to include the library in your node.js project, add the following dependency to your package.json file.
{
"dependencies": {
"clj-fuzzy": "0.4.1"
}
}
The npm version already works with browserify
and any other tools enabling you to use require
for client-side assets.
clj-fuzzy ships with three API namespaces: clj_fuzzy.metrics, clj_fuzzy.stemmers and clj_fuzzy.phonetics, each bundling the relevant algorithms.
Just require the library and you are good to go.
var clj_fuzzy = require('clj-fuzzy');
// Compute the Dice coefficient of two words
clj_fuzzy.metrics.dice('healed', 'sealed');
0.8
clj_fuzzy.metrics.dice('healed', 'herded');
0.4
// There is also a Sorensen alias
clj_fuzzy.metrics.sorensen('healed', 'herded');
0.4
// Compute the Levenshtein distance between two words
clj_fuzzy.metrics.levenshtein('book', 'back');
2
clj_fuzzy.metrics.levenshtein('hello', 'helo');
1
// Compute the Hamming distance between two words
clj_fuzzy.metrics.hamming('ramer', 'cases');
3
clj_fuzzy.metrics.hamming([0, 1, 0, 1], [1, 1, 0, 1]);
1
// Compute the Jaccard distance between two words
// 0 meaning two identical strings and 1 two totally different ones
clj_fuzzy.metrics.jaccard('abc', 'xyz');
1
clj_fuzzy.metrics.jaccard('night', 'nacht');
0.5714285714285714
// If you prefer the Tanimoto name, an alias exists
clj_fuzzy.metrics.tanimoto('night', 'nacht');
0.5714285714285714
// Compute the Jaro distance between two words
clj_fuzzy.metrics.jaro('Dwayne', 'Duane');
0.8222222222222223
// Compute the Jaro-Winkler distance between two words
clj_fuzzy.metrics.jaro_winkler('Dwayne', 'Duane');
0.8400000000000001
// Compare two strings using the Match Rating Approach
clj_fuzzy.metrics.mra_comparison('Byrne', 'Boern');
>>> {
minimum: 4,
similarity: 5,
code: ['BYRN', 'BRN'],
match: true
}
// Compute the Tversky index of two sequences.
clj_fuzzy.metrics.tversky('night', 'nacht');
0.42857142857142855
// Compute the same index for a precise alpha and beta value
// Default value is alpha = beta = 1 and produces the Jaccard coefficient
// alpha = beta = 0.5 produces the Dice coefficient (without bigrams)
clj_fuzzy.metrics.tversky('healed', 'sealed', {alpha: 0.5, beta: 0.5});
0.8
// You can also specify whether you want to compute the
// symmetric variant of the index
clj_fuzzy.metrics.tversky(
'healed',
'sealed',
{
alpha: 1,
beta: 1,
symmetric: true
}
);
0.8
// Compute the stem of a word using the Lancaster stemmer
clj_fuzzy.stemmers.lancaster('worker');
'work'
clj_fuzzy.stemmers.lancaster('presumably');
'presum'
// Compute the stem of a word using the Lovins stemmer
clj_fuzzy.stemmers.lovins('nationality');
'nat'
clj_fuzzy.stemmers.lovins('analytic');
'analys'
// Compute the stem of a word using the Porter stemmer
clj_fuzzy.stemmers.porter('adjective');
'adject'
clj_fuzzy.stemmers.porter('building');
'build'
// Compute the noun and verb stems of a word using the Schinke stemmer
clj_fuzzy.stemmers.schinke('aquila');
>>> {noun: 'aquil', verb: 'aquila'}
clj_fuzzy.stemmers.schinke('apparebunt');
>>> {noun: 'apparebu', verb: 'apparebi'}
// Compute the metaphone code for a single word
clj_fuzzy.phonetics.metaphone('hypocrite');
"HPKRT"
clj_fuzzy.phonetics.metaphone('discrimination');
"TSKRMNXN"
// Compute the double metaphone of a word
clj_fuzzy.phonetics.double_metaphone('Smith');
["SM0", "XMT"]
clj_fuzzy.phonetics.double_metaphone('Schmidt');
["XMT", "SMT"]
// Compute the soundex code of a single name
clj_fuzzy.phonetics.soundex('Ashcroft');
"A261"
clj_fuzzy.phonetics.soundex('Andrew');
"A536"
// Compute the NYSIIS code of a single name
clj_fuzzy.phonetics.nysiis('Andrew');
"ANDR"
clj_fuzzy.phonetics.nysiis('Mclaughlin');
"MCLAGLAN"
// Compute the refined NYSIIS code of a single name
clj_fuzzy.phonetics.nysiis('Aegir', 'refined');
"AGAR"
// Compute the caverphone code of a single name
clj_fuzzy.phonetics.caverphone('Henrichsen');
"ANRKSN1111"
clj_fuzzy.phonetics.caverphone('Mclaverty');
"MKLFTA1111"
// Compute the "revisited" caverphone code of a single name
clj_fuzzy.phonetics.caverphone('Stevenson', 'revisited');
"STFNSN1111"
// Compute the cologne phonetic code of a single word
clj_fuzzy.phonetics.cologne('Müller-Lüdenscheidt');
"65752682"
clj_fuzzy.phonetics.cologne('Breschnew');
"17863"
// Compute the MRA codex of a single name
clj_fuzzy.phonetics.mra_codex('Catherine');
"CTHRN"
clj_fuzzy.phonetics.mra_codex('Smith');
"SMTH"