Source code for datasketch.weighted_minhash

import collections
import copy
import numpy as np


[docs]class WeightedMinHash(object): '''New weighted MinHash is generated by :class:`datasketch.WeightedMinHashGenerator`. You can also initialize weighted MinHash by using the state from an existing one. Args: seed (int): The random seed used to generate this weighted MinHash. hashvalues: The internal state of this weighted MinHash. '''
[docs] def __init__(self, seed, hashvalues): self.seed = seed self.hashvalues = hashvalues
[docs] def jaccard(self, other): '''Estimate the `weighted Jaccard similarity`_ between the multi-sets represented by this weighted MinHash and the other. Args: other (datasketch.WeightedMinHash): The other weighted MinHash. Returns: float: The weighted Jaccard similarity between 0.0 and 1.0. .. _`weighted Jaccard similarity`: http://mathoverflow.net/questions/123339/weighted-jaccard-similarity ''' if other.seed != self.seed: raise ValueError("Cannot compute Jaccard given WeightedMinHash objects with\ different seeds") if len(self) != len(other): raise ValueError("Cannot compute Jaccard given WeightedMinHash objects with\ different numbers of hash values") # Check how many pairs of (k, t) hashvalues are equal intersection = 0 for this, that in zip(self.hashvalues, other.hashvalues): if np.array_equal(this, that): intersection += 1 return float(intersection) / float(len(self))
[docs] def digest(self): '''Export the hash values, which is the internal state of the weighted MinHash. Returns: numpy.array: The hash values which is a Numpy array. ''' return copy.copy(self.hashvalues)
[docs] def copy(self): ''' Returns: datasketch.WeightedMinHash: A copy of this weighted MinHash by exporting its state. ''' return WeightedMinHash(self.seed, self.digest())
[docs] def __len__(self): ''' Returns: int: The number of hash values. ''' return len(self.hashvalues)
[docs] def __eq__(self, other): ''' Returns: bool: If their seeds and hash values are both equal then two are equivalent. ''' return type(self) is type(other) and \ self.seed == other.seed and \ np.array_equal(self.hashvalues, other.hashvalues)
[docs]class WeightedMinHashGenerator(object): '''The weighted MinHash generator is used for creating new :class:`datasketch.WeightedMinHash` objects. This weighted MinHash implementation is based on Sergey Ioffe's paper, `Improved Consistent Sampling, Weighted Minhash and L1 Sketching <http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf>`_ Args: dim (int): The number of dimensions of the input Jaccard vectors. sample_size (int, optional): The number of samples to use for creating weighted MinHash. seed (int): The random seed to use for generating permutation functions. '''
[docs] def __init__(self, dim, sample_size=128, seed=1): self.dim = dim self.sample_size = sample_size self.seed = seed generator = np.random.RandomState(seed=seed) self.rs = generator.gamma(2, 1, (sample_size, dim)).astype(np.float32) self.ln_cs = np.log(generator.gamma(2, 1, (sample_size, dim))).astype(np.float32) self.betas = generator.uniform(0, 1, (sample_size, dim)).astype(np.float32)
[docs] def minhash(self, v): '''Create a new weighted MinHash given a weighted Jaccard vector. Each dimension is an integer frequency of the corresponding element in the multi-set represented by the vector. Args: v (numpy.array): The Jaccard vector. ''' if not isinstance(v, collections.Iterable): raise TypeError("Input vector must be an iterable") if not len(v) == self.dim: raise ValueError("Input dimension mismatch, expecting %d" % self.dim) if not isinstance(v, np.ndarray): v = np.array(v, dtype=np.float32) elif v.dtype != np.float32: v = v.astype(np.float32) hashvalues = np.zeros((self.sample_size, 2), dtype=np.int) vzeros = (v == 0) if vzeros.all(): raise ValueError("Input is all zeros") v[vzeros] = np.nan vlog = np.log(v) for i in range(self.sample_size): t = np.floor((vlog / self.rs[i]) + self.betas[i]) ln_y = (t - self.betas[i]) * self.rs[i] ln_a = self.ln_cs[i] - ln_y - self.rs[i] k = np.nanargmin(ln_a) hashvalues[i][0], hashvalues[i][1] = k, int(t[k]) return WeightedMinHash(self.seed, hashvalues)