clj-fuzzy

A handy collection of algorithms dealing with fuzzy strings and phonetics.

Node.js


~Deprecation warning~

Consider this library deprecated for JavaScript.

Indeed, the Talisman library can be seen as an improvement over clj-fuzzy and is, what’s more, written directly in JavaScript.

Installation

To install the latest Node.js version of the library, just run:

npm install clj-fuzzy

If you want the latest development version

npm install git+https://github.com/Yomguithereal/clj-fuzzy.git

Finally, if you need to include the library in your node.js project, add the following dependency to your package.json file.

{
  "dependencies": {
    "clj-fuzzy": "0.4.1"
  }
}

The npm version already works with browserify and any other tools enabling you to use require for client-side assets.


Usage

clj-fuzzy ships with three API namespaces: clj_fuzzy.metrics, clj_fuzzy.stemmers and clj_fuzzy.phonetics, each containing the relevant algorithms.

Just require the library and you are good to go.

var clj_fuzzy = require('clj-fuzzy');

clj-fuzzy.metrics

clj-fuzzy.stemmers

clj-fuzzy.phonetics


clj-fuzzy.metrics

Sorensen / Dice coefficient

// Compute the Dice coefficient of two words
clj_fuzzy.metrics.dice('healed', 'sealed');
0.8

clj_fuzzy.metrics.dice('healed', 'herded');
0.4

// There is also a Sorensen alias
clj_fuzzy.metrics.sorensen('healed', 'herded');
0.4

Levenshtein distance

// Compute the Levenshtein distance between two words
clj_fuzzy.metrics.levenshtein('book', 'back');
2

clj_fuzzy.metrics.levenshtein('hello', 'helo');
1

Hamming distance

// Compute the Hamming distance between two words
clj_fuzzy.metrics.hamming('ramer', 'cases');
3

clj_fuzzy.metrics.hamming([0, 1, 0, 1], [1, 1, 0, 1]);
1

Jaccard / Tanimoto distance

// Compute the Jaccard distance between two words
// 0 meaning two identical strings and 1 two totally different ones
clj_fuzzy.metrics.jaccard('abc', 'xyz');
1

clj_fuzzy.metrics.jaccard('night', 'nacht');
0.5714285714285714

// If you prefer the Tanimoto name, an alias exists
clj_fuzzy.metrics.tanimoto('night', 'nacht');
0.5714285714285714

Jaro-Winkler distance

// Compute the Jaro distance between two words
clj_fuzzy.metrics.jaro('Dwayne', 'Duane');
0.8222222222222223

// Compute the Jaro-Winkler distance between two words
clj_fuzzy.metrics.jaro_winkler('Dwayne', 'Duane');
0.8400000000000001

MRA Comparison

// Compare two strings using the Match Rating Approach
clj_fuzzy.metrics.mra_comparison('Byrne', 'Boern');
>>> {
  minimum: 4,
  similarity: 5,
  code: ['BYRN', 'BRN'],
  match: true
}

Tversky Index

// Compute the Tversky index of two sequences.
clj_fuzzy.metrics.tversky('night', 'nacht');
0.42857142857142855

// Compute the same index for a precise alpha and beta value
// The default values are alpha = beta = 1, which produce the Jaccard coefficient
// alpha = beta = 0.5 produces the Dice coefficient (without bigrams)
clj_fuzzy.metrics.tversky('healed', 'sealed', {alpha: 0.5, beta: 0.5});
0.8

// You can also specify whether you want to compute the
// symmetric variant of the index
clj_fuzzy.metrics.tversky(
  'healed',
  'sealed',
  {
    alpha: 1,
    beta: 1,
    symmetric: true
  }
);
0.8

clj-fuzzy.stemmers

Lancaster stemmer

// Compute the stem of a word
clj_fuzzy.stemmers.lancaster('worker');
'work'

clj_fuzzy.stemmers.lancaster('presumably');
'presum'

Lovins stemmer

// Compute the stem of a word
clj_fuzzy.stemmers.lovins('nationality');
'nat'

clj_fuzzy.stemmers.lovins('analytic');
'analys'

Porter stemmer

// Compute the stem of a word
clj_fuzzy.stemmers.porter('adjective');
'adject'

clj_fuzzy.stemmers.porter('building');
'build'

Schinke stemmer

// Compute the stem of a word
clj_fuzzy.stemmers.schinke('aquila');
>>> {noun: 'aquil', verb: 'aquila'}

clj_fuzzy.stemmers.schinke('apparebunt');
>>> {noun: 'apparebu', verb: 'apparebi'}

clj-fuzzy.phonetics

Metaphone

// Compute the metaphone code for a single word
clj_fuzzy.phonetics.metaphone('hypocrite');
"HPKRT"

clj_fuzzy.phonetics.metaphone('discrimination');
"TSKRMNXN"

Double Metaphone

// Compute the double metaphone of a word
clj_fuzzy.phonetics.double_metaphone('Smith');
["SM0", "XMT"]

clj_fuzzy.phonetics.double_metaphone('Schmidt');
["XMT", "SMT"]

Soundex

// Compute the soundex code of a single name
clj_fuzzy.phonetics.soundex('Ashcroft');
"A261"

clj_fuzzy.phonetics.soundex('Andrew');
"A536"

NYSIIS

// Compute the NYSIIS code of a single name
clj_fuzzy.phonetics.nysiis('Andrew');
"ANDR"

clj_fuzzy.phonetics.nysiis('Mclaughlin');
"MCLAGLAN"

// Compute the refined NYSIIS code of a single name
clj_fuzzy.phonetics.nysiis('Aegir', 'refined');
"AGAR"

Caverphone

// Compute the caverphone code of a single name
clj_fuzzy.phonetics.caverphone('Henrichsen');
"ANRKSN1111"

clj_fuzzy.phonetics.caverphone('Mclaverty');
"MKLFTA1111"

// Compute the "revisited" caverphone code of a single name
clj_fuzzy.phonetics.caverphone('Stevenson', 'revisited');
"STFNSN1111"

Cologne Phonetic

// Compute the cologne phonetic code of a single word
clj_fuzzy.phonetics.cologne('Müller-Lüdenscheidt');
"65752682"

clj_fuzzy.phonetics.cologne('Breschnew');
"17863"

MRA Codex

// Compute the MRA codex of a single name
clj_fuzzy.phonetics.mra_codex('Catherine');
"CTHRN"

clj_fuzzy.phonetics.mra_codex('Smith');
"SMTH"