# -*- coding: utf-8 -*-
"""
Created on Mon Oct 20 11:49:24 2014
@author: gsubramanian
"""
import numpy as np
from collections import defaultdict
def col_wise_jaccard_sim(in_matrix):
rows,cols = in_matrix.shape
sim_list = defaultdict(float)
for col in range(cols):
for nxt_col in range(col+1,cols):
a = in_matrix[:,col]
b = in_matrix[:,nxt_col]
equal_count = 0
total = a.size + b.size
for r in range(rows):
if a[r,0] == b[r,0]:
equal_count+=1
key = str(col) + "-" + str(nxt_col)
sim_list[key] = equal_count/(1.0*total)
return sim_list
def getSigMatrix(char_matrix,hash_funcs):
"""
Returns a signature matrix
Args:
Input : char_matrix - characteristic matrix
"""
rows,cols = char_matrix.shape
no_hash = len(hash_funcs)
sig_matrix =np.full((no_hash ,cols),np.inf).reshape(no_hash,cols)
sig_matrix = np.matrix(sig_matrix)
for row in range(rows):
row_value = char_matrix[row,:]
h_value = []
for h_func in hash_funcs:
h_value.append(h_func(row))
for col in range(row_value.size):
if row_value[0,col] == 1:
for j in range(no_hash):
if sig_matrix[j,col] > h_value[j]:
sig_matrix[j,col] = h_value[j]
return sig_matrix
if __name__ == "__main__":
"""
Example 3.8 :
(a) h3(x) = x+1 mod 5
(b) h4(x) = 3x+1 mod 5
Element S1 S2 S3 S4
0 1 0 0 1
1 0 0 1 0
2 0 1 0 1
3 1 0 1 1
4 0 0 1 0
"""
hash_functions = []
hash_functions.append(lambda x:(x+1)%5)
hash_functions.append(lambda x:(3*x+1)%5)
char_matrix = np.matrix([
[1,0,0,1],
[0,0,1,0],
[0,1,0,1],
[1,0,1,1],
[0,0,1,0]
])
print getSigMatrix(char_matrix,hash_functions)
"""
Exercise 3.3.2 : Using the data from Fig. 3.4, add to the signatures of the
columns the values of the following hash functions:
(a) h3(x) = 2x + 4.
(b) h4(x) = 3x − 1.
Element S1 S2 S3 S4
0 0 1 0 1
1 0 1 0 0
2 1 0 0 1
3 0 0 1 0
4 0 0 1 1
5 1 0 0 0
"""
hash_functions = []
hash_functions.append(lambda x:(x+1)%5)
hash_functions.append(lambda x:(3*x+1)%5)
hash_functions.append(lambda x:(2*x+4)%5)
hash_functions.append(lambda x:(3*x-1)%5)
char_matrix = np.matrix([
[1,0,0,1],
[0,0,1,0],
[0,1,0,1],
[1,0,1,1],
[0,0,1,0]
])
print getSigMatrix(char_matrix,hash_functions)
"""
Exercise 3.3.3 : In Fig. 3.5 is a matrix with six rows.
(a) Compute the minhash signature for each column if we use the following
three hash functions: h1(x) = 2x + 1 mod 6; h2(x) = 3x + 2 mod 6;
h3(x) = 5x + 2 mod 6.
(b) Which of these hash functions are true permutations?
(c) How close are the estimated Jaccard similarities for the six pairs of columns
to the true Jaccard similarities?
"""
char_matrix = np.matrix([
[0,1,0,1],
[0,1,0,0],
[1,0,0,1],
[0,0,1,0],
[0,0,1,1],
[1,0,0,0]
])
hash_functions = []
hash_functions.append(lambda x:(2*x+1)%6)
hash_functions.append(lambda x:(3*x+2)%6)
hash_functions.append(lambda x:(5*x+2)%6)
hash_functions.append(lambda x:(6*x+2)%6)
sig_matrix = getSigMatrix(char_matrix,hash_functions)
# Min hash signatures
print sig_matrix
print
print "Jaccard similarity of characteristic matrix"
js_dict_char_matrix = col_wise_jaccard_sim(char_matrix)
for k,v in js_dict_char_matrix.items():
print k,v
print
print "Jaccard similarity of signature matrix"
js_dict_sig_matrix = col_wise_jaccard_sim(sig_matrix)
for k,v in js_dict_sig_matrix.items():
print k,v
IyAtKi0gY29kaW5nOiB1dGYtOCAtKi0KIiIiCkNyZWF0ZWQgb24gTW9uIE9jdCAyMCAxMTo0OToyNCAyMDE0CkBhdXRob3I6IGdzdWJyYW1hbmlhbgoiIiIKCmltcG9ydCBudW1weSBhcyBucApmcm9tIGNvbGxlY3Rpb25zIGltcG9ydCBkZWZhdWx0ZGljdAoKZGVmIGNvbF93aXNlX2phY2NhcmRfc2ltKGluX21hdHJpeCk6CiAgICByb3dzLGNvbHMgPSBpbl9tYXRyaXguc2hhcGUKICAgIHNpbV9saXN0ID0gZGVmYXVsdGRpY3QoZmxvYXQpCiAgICBmb3IgY29sIGluIHJhbmdlKGNvbHMpOgogICAgICAgIGZvciBueHRfY29sIGluIHJhbmdlKGNvbCsxLGNvbHMpOgogICAgICAgICAgICBhID0gaW5fbWF0cml4WzosY29sXQogICAgICAgICAgICBiID0gaW5fbWF0cml4Wzosbnh0X2NvbF0KICAgICAgICAgICAgZXF1YWxfY291bnQgPSAwCiAgICAgICAgICAgIHRvdGFsID0gYS5zaXplICsgYi5zaXplCiAgICAgICAgICAgIGZvciByIGluIHJhbmdlKHJvd3MpOgogICAgICAgICAgICAgICAgaWYgYVtyLDBdID09IGJbciwwXToKICAgICAgICAgICAgICAgICAgICBlcXVhbF9jb3VudCs9MQogICAgICAgICAgICBrZXkgPSBzdHIoY29sKSArICItIiArIHN0cihueHRfY29sKQogICAgICAgICAgICBzaW1fbGlzdFtrZXldID0gZXF1YWxfY291bnQvKDEuMCp0b3RhbCkKICAgIHJldHVybiBzaW1fbGlzdAogICAgICAgICAgICAgICAgCgpkZWYgZ2V0U2lnTWF0cml4KGNoYXJfbWF0cml4LGhhc2hfZnVuY3MpOgogICAgIiIiCiAgICBSZXR1cm5zIGEgc2lnbmF0dXJlIG1hdHJpeAogICAgQXJnczoKICAgICAgICBJbnB1dCA6IGNoYXJfbWF0cml4IC0gY2hhcmFjdGVyaXN0aWMgbWF0cml4CiAgICAgICAgICAgICAgICAKICAgICIiIgogICAgcm93cyxjb2xzID0gY2hhcl9tYXRyaXguc2hhcGUKICAgIG5vX2hhc2ggPSBsZW4oaGFzaF9mdW5jcykKICAgIHNpZ19tYXRyaXggPW5wLmZ1bGwoKG5vX2hhc2ggLGNvbHMpLG5wLmluZikucmVzaGFwZShub19oYXNoLGNvbHMpCiAgICBzaWdfbWF0cml4ID0gbnAubWF0cml4KHNpZ19tYXRyaXgpIAogICAgZm9yIHJvdyBpbiByYW5nZShyb3dzKToKICAgICAgICByb3dfdmFsdWUgPSBjaGFyX21hdHJpeFtyb3csOl0KICAgICAgICBoX3ZhbHVlID0gW10KICAgICAgICBmb3IgaF9mdW5jIGluIGhhc2hfZnVuY3M6CiAgICAgICAgICAgIGhfdmFsdWUuYXBwZW5kKGhfZnVuYyhyb3cpKQogICAgICAgIGZvciBjb2wgaW4gcmFuZ2Uocm93X3ZhbHVlLnNpemUpOgogICAgICAgICAgICBpZiByb3dfdmFsdWVbMCxjb2xdID09IDE6CiAgICAgICAgICAgICAgICBmb3IgaiBpbiByYW5nZShub19oYXNoKToKICAgICAgICAgICAgICAgICAgICBpZiBzaWdfbWF0cml4W2osY29sXSA+IGhfdmFsdWVbal06CiAgICAgICAgICAgICAgICAgICAgICAgIHNpZ19tYXRyaXhbaixjb2xdID0gaF92YWx1ZVtqXQogICAgcmV0dXJuIHNpZ19tYXRyaXgKCgppZiBfX25hbWVfXyA9PSAiX19tYWluX18iOgogICAgIiIiCiAgICBFeGFtcGxlIDMuOCA6IAogICAgKGEpIGgzKHgpID0geCsxIG1vZCA1CiAgICAoYikgaDQoeCkgPSAzeCsxIG1vZCA1CiAgICAgIEVsZW1lbnQgUzEgUzIgUzMgUzQKICAgICAgICAgICAgMCAgMSAwICAwICAxCiAgICAgICAgICAgIDEgIDAgMCAgMSAgMAogICAgICAgICAgICAyICAwIDEgIDAgIDEKICAgICAgICAgICAgMyAgMSAwICAxICAxCiAgICAgICAgICAgIDQgIDAgMCAgMSAgMAogICAgIiIiCiAgICAKICAgIGhhc2hfZnVuY3Rpb25zID0gW10KICAgIGhhc2hfZnVuY3Rpb25zLmFwcGVuZChsYW1iZGEgeDooeCsxKSU1KQogICAgaGFzaF9mdW5jdGlvbnMuYXBwZW5kKGxhbWJkYSB4OigzKngrMSklNSkKICAgIAogICAgCiAgICBjaGFyX21hdHJpeCA9IG5wLm1hdHJpeChbCiAgICAgICAgICAgICAgICAgICAgICAgIAlbMSwwLDAsMV0sCiAgICAgICAgICAgICAgICAgICAgICAgICAgIFswLDAsMSwwXSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgWzAsMSwwLDFdLAogICAgICAgICAgICAgICAgICAgICAgICAgICBbMSwwLDEsMV0sCiAgICAgICAgICAgICAgICAgICAgICAgICAgIFswLDAsMSwwXQogICAgICAgICAgICAgICAgICAgICAgICBdKQogICAgCiAgICBwcmludCBnZXRTaWdNYXRyaXgoY2hhcl9tYXRyaXgsaGFzaF9mdW5jdGlvbnMpICAgICAgICAgICAgICAgICAgICAKICAgIAogICAgCiAgICAiIiIKICAgIEV4ZXJjaXNlIDMuMy4yIDogVXNpbmcgdGhlIGRhdGEgZnJvbSBGaWcuIDMuNCwgYWRkIHRvIHRoZSBzaWduYXR1cmVzIG9mIHRoZQogICAgY29sdW1ucyB0aGUgdmFsdWVzIG9mIHRoZSBmb2xsb3dpbmcgaGFzaCBmdW5jdGlvbnM6CiAgICAoYSkgaDMoeCkgPSAyeCArIDQuCiAgICAoYikgaDQoeCkgPSAzeCDiiJIgMS4KICAgICAgRWxlbWVudCBTMSBTMiBTMyBTNAogICAgICAgICAgICAwICAwIDEgIDAgIDEKICAgICAgICAgICAgMSAgMCAxICAwICAwCiAgICAgICAgICAgIDIgIDEgMCAgMCAgMQogICAgICAgICAgICAzICAwIDAgIDEgIDAKICAgICAgICAgICAgNCAgMCAwICAxICAxCiAgICAgICAgICAgIDUgIDEgMCAgMCAgMAogICAgIiIiCiAgICAKICAgIGhhc2hfZnVuY3Rpb25zID0gW10KICAgIGhhc2hfZnVuY3Rpb25zLmFwcGVuZChsYW1iZGEgeDooeCsxKSU1KQogICAgaGFzaF9mdW5jdGlvbnMuYXBwZW5kKGxhbWJkYSB4OigzKngrMSklNSkKICAgIGhhc2hfZnVuY3Rpb25zLmFwcGVuZChsYW1iZGEgeDooMip4KzQpJTUpCiAgICBoYXNoX2Z1bmN0aW9ucy5hcHBlbmQobGFtYmRhIHg6KDMqeC0xKSU1KQogICAgCiAgICAKICAgIGNoYXJfbWF0cml4ID0gbnAubWF0cml4KFsKICAgICAgICAgICAgICAgICAgICAgICAgCVsxLDAsMCwxXSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgWzAsMCwxLDBdLAogICAgICAgICAgICAgICAgICAgICAgICAgICBbMCwxLDAsMV0sCiAgICAgICAgICAgICAgICAgICAgICAgICAgIFsxLDAsMSwxXSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgWzAsMCwxLDBdCiAgICAgICAgICAgICAgICAgICAgICAgIF0pCiAgICAKICAgIHByaW50IGdldFNpZ01hdHJpeChjaGFyX21hdHJpeCxoYXNoX2Z1bmN0aW9ucykgICAgICAgICAgICAgICAgICAgIAogICAgCiAgICAiIiIKICAgIEV4ZXJjaXNlIDMuMy4zIDogSW4gRmlnLiAzLjUgaXMgYSBtYXRyaXggd2l0aCBzaXggcm93cy4KICAgIAogICAgKGEpIENvbXB1dGUgdGhlIG1pbmhhc2ggc2lnbmF0dXJlIGZvciBlYWNoIGNvbHVtbiBpZiB3ZSB1c2UgdGhlIGZvbGxvd2luZwogICAgdGhyZWUgaGFzaCBmdW5jdGlvbnM6IGgxKHgpID0gMnggKyAxIG1vZCA2OyBoMih4KSA9IDN4ICsgMiBtb2QgNjsKICAgIGgzKHgpID0gNXggKyAyIG1vZCA2LgogICAgKGIpIFdoaWNoIG9mIHRoZXNlIGhhc2ggZnVuY3Rpb25zIGFyZSB0cnVlIHBlcm11dGF0aW9ucz8KICAgIChjKSBIb3cgY2xvc2UgYXJlIHRoZSBlc3RpbWF0ZWQgSmFjY2FyZCBzaW1pbGFyaXRpZXMgZm9yIHRoZSBzaXggcGFpcnMgb2YgY29sdW1ucwogICAgdG8gdGhlIHRydWUgSmFjY2FyZCBzaW1pbGFyaXRpZXM/CiAgICAiIiIKICAgIAogICAgY2hhcl9tYXRyaXggPSBucC5tYXRyaXgoWwogICAgICAgICAgICAgICAgICAgICAgICAJWzAsMSwwLDFdLAogICAgICAgICAgICAgICAgICAgICAgICAgICBbMCwxLDAsMF0sCiAgICAgICAgICAgICAgICAgICAgICAgICAgIFsxLDAsMCwxXSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgWzAsMCwxLDBdLAogICAgICAgICAgICAgICAgICAgICAgICAgICBbMCwwLDEsMV0sCiAgICAgICAgICAgICAgICAgICAgICAgICAgIFsxLDAsMCwwXSAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgIF0pCiAgICBoYXNoX2Z1bmN0aW9ucyA9IFtdCiAgICAKICAgIGhhc2hfZnVuY3Rpb25zLmFwcGVuZChsYW1iZGEgeDooMip4KzEpJTYpCiAgICBoYXNoX2Z1bmN0aW9ucy5hcHBlbmQobGFtYmRhIHg6KDMqeCsyKSU2KQogICAgaGFzaF9mdW5jdGlvbnMuYXBwZW5kKGxhbWJkYSB4Oig1KngrMiklNikKICAgIGhhc2hfZnVuY3Rpb25zLmFwcGVuZChsYW1iZGEgeDooNip4KzIpJTYpCiAgICAKICAgIHNpZ19tYXRyaXggPSBnZXRTaWdNYXRyaXgoY2hhcl9tYXRyaXgsaGFzaF9mdW5jdGlvbnMpICAgICAgICAgICAgICAgICAgICAKICAgIAogICAgIyBNaW4gaGFzaCBzaWduYXR1cmVzCiAgICBwcmludCBzaWdfbWF0cml4CiAgICAKICAgIHByaW50CiAgICBwcmludCAiSmFjY2FyZCBzaW1pbGFyaXR5IG9mIGNoYXJhY3RlcmlzdGljIG1hdHJpeCIKICAgIGpzX2RpY3RfY2hhcl9tYXRyaXggPSBjb2xfd2lzZV9qYWNjYXJkX3NpbShjaGFyX21hdHJpeCkKICAgIGZvciBrLHYgaW4gIGpzX2RpY3RfY2hhcl9tYXRyaXguaXRlbXMoKToKICAgICAgICBwcmludCBrLHYKICAgIAogICAgcHJpbnQKICAgIAogICAgcHJpbnQgIkphY2NhcmQgc2ltaWxhcml0eSBvZiBzaWduYXR1cmUgbWF0cml4IgogICAganNfZGljdF9zaWdfbWF0cml4ID0gY29sX3dpc2VfamFjY2FyZF9zaW0oc2lnX21hdHJpeCkKICAgIGZvciBrLHYgaW4gIGpzX2RpY3Rfc2lnX21hdHJpeC5pdGVtcygpOgogICAgICAgIHByaW50IGssdgo=