fork download
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Mon Oct 20 11:49:24 2014
  4. @author: gsubramanian
  5. """
  6.  
  7. import numpy as np
  8. from collections import defaultdict
  9.  
  10. def col_wise_jaccard_sim(in_matrix):
  11. rows,cols = in_matrix.shape
  12. sim_list = defaultdict(float)
  13. for col in range(cols):
  14. for nxt_col in range(col+1,cols):
  15. a = in_matrix[:,col]
  16. b = in_matrix[:,nxt_col]
  17. equal_count = 0
  18. total = a.size + b.size
  19. for r in range(rows):
  20. if a[r,0] == b[r,0]:
  21. equal_count+=1
  22. key = str(col) + "-" + str(nxt_col)
  23. sim_list[key] = equal_count/(1.0*total)
  24. return sim_list
  25.  
  26.  
  27. def getSigMatrix(char_matrix,hash_funcs):
  28. """
  29. Returns a signature matrix
  30. Args:
  31. Input : char_matrix - characteristic matrix
  32.  
  33. """
  34. rows,cols = char_matrix.shape
  35. no_hash = len(hash_funcs)
  36. sig_matrix =np.full((no_hash ,cols),np.inf).reshape(no_hash,cols)
  37. sig_matrix = np.matrix(sig_matrix)
  38. for row in range(rows):
  39. row_value = char_matrix[row,:]
  40. h_value = []
  41. for h_func in hash_funcs:
  42. h_value.append(h_func(row))
  43. for col in range(row_value.size):
  44. if row_value[0,col] == 1:
  45. for j in range(no_hash):
  46. if sig_matrix[j,col] > h_value[j]:
  47. sig_matrix[j,col] = h_value[j]
  48. return sig_matrix
  49.  
  50.  
  51. if __name__ == "__main__":
  52. """
  53. Example 3.8 :
  54. (a) h3(x) = x+1 mod 5
  55. (b) h4(x) = 3x+1 mod 5
  56. Element S1 S2 S3 S4
  57. 0 1 0 0 1
  58. 1 0 0 1 0
  59. 2 0 1 0 1
  60. 3 1 0 1 1
  61. 4 0 0 1 0
  62. """
  63.  
  64. hash_functions = []
  65. hash_functions.append(lambda x:(x+1)%5)
  66. hash_functions.append(lambda x:(3*x+1)%5)
  67.  
  68.  
  69. char_matrix = np.matrix([
  70. [1,0,0,1],
  71. [0,0,1,0],
  72. [0,1,0,1],
  73. [1,0,1,1],
  74. [0,0,1,0]
  75. ])
  76.  
  77. print getSigMatrix(char_matrix,hash_functions)
  78.  
  79.  
  80. """
  81. Exercise 3.3.2 : Using the data from Fig. 3.4, add to the signatures of the
  82. columns the values of the following hash functions:
  83. (a) h3(x) = 2x + 4.
  84. (b) h4(x) = 3x − 1.
  85. Element S1 S2 S3 S4
  86. 0 0 1 0 1
  87. 1 0 1 0 0
  88. 2 1 0 0 1
  89. 3 0 0 1 0
  90. 4 0 0 1 1
  91. 5 1 0 0 0
  92. """
  93.  
  94. hash_functions = []
  95. hash_functions.append(lambda x:2*x+4)
  96. hash_functions.append(lambda x:3*x-1)
  97.  
  98.  
  99. char_matrix = np.matrix([
  100. [1,0,0,1],
  101. [0,0,1,0],
  102. [0,1,0,1],
  103. [1,0,1,1],
  104. [0,0,1,0]
  105. ])
  106.  
  107. print getSigMatrix(char_matrix,hash_functions)
  108.  
  109. """
  110. Exercise 3.3.3 : In Fig. 3.5 is a matrix with six rows.
  111.  
  112. (a) Compute the minhash signature for each column if we use the following
  113. three hash functions: h1(x) = 2x + 1 mod 6; h2(x) = 3x + 2 mod 6;
  114. h3(x) = 5x + 2 mod 6.
  115. (b) Which of these hash functions are true permutations?
  116. (c) How close are the estimated Jaccard similarities for the six pairs of columns
  117. to the true Jaccard similarities?
  118. """
  119.  
  120. char_matrix = np.matrix([
  121. [0,1,0,1],
  122. [0,1,0,0],
  123. [1,0,0,1],
  124. [0,0,1,0],
  125. [0,0,1,1],
  126. [1,0,0,0]
  127. ])
  128. hash_functions = []
  129.  
  130. hash_functions.append(lambda x:(2*x+1)%6)
  131. hash_functions.append(lambda x:(3*x+2)%6)
  132. hash_functions.append(lambda x:(5*x+2)%6)
  133. hash_functions.append(lambda x:(6*x+2)%6)
  134.  
  135. sig_matrix = getSigMatrix(char_matrix,hash_functions)
  136.  
  137. # Min hash signatures
  138. print sig_matrix
  139.  
  140. print
  141. print "Jaccard similarity of characteristic matrix"
  142. js_dict_char_matrix = col_wise_jaccard_sim(char_matrix)
  143. for k,v in js_dict_char_matrix.items():
  144. print k,v
  145.  
  146. print
  147.  
  148. print "Jaccard similarity of signature matrix"
  149. js_dict_sig_matrix = col_wise_jaccard_sim(sig_matrix)
  150. for k,v in js_dict_sig_matrix.items():
  151. print k,v
  152.  
Success #stdin #stdout 0.08s 23388KB
stdin
Standard input is empty
stdout
[[1. 3. 0. 1.]
 [0. 2. 0. 0.]]
[[ 4.  8.  6.  4.]
 [-1.  5.  2. -1.]]
[[5. 1. 1. 1.]
 [2. 2. 2. 2.]
 [0. 1. 4. 0.]
 [2. 2. 2. 2.]]

Jaccard similarity of characteristic matrix
2-3 0.25
0-1 0.166666666667
0-2 0.166666666667
0-3 0.25
1-3 0.25
1-2 0.166666666667

Jaccard similarity of signature matrix
2-3 0.375
0-1 0.25
0-2 0.25
0-3 0.375
1-3 0.375
1-2 0.375