|
1
|
|
|
import getopt |
|
2
|
|
|
import os |
|
3
|
|
|
import sys |
|
4
|
|
|
import time |
|
5
|
|
|
from concurrent.futures import ThreadPoolExecutor |
|
6
|
|
|
from concurrent.futures import ProcessPoolExecutor |
|
7
|
|
|
import numpy as np |
|
8
|
|
|
import math |
|
9
|
|
|
|
|
10
|
|
|
# Number of query vectors handled per round in main(); each round starts one
# worker process per query vector (see ground_truth_process).
PROCESS_NUM = 12
# When True, ground_truth_process also saves the ground-truth vectors themselves
# to VEC_FILE_NAME.
GET_VEC = False
# When True, load_vec_list reads files with pandas.read_csv instead of np.load.
CSV = False
# When True, load_vec_list rescales loaded data as (data + 0.5) / 255.
UINT8 = False

# Folder listed by the get_ground_truth_* functions for base vector files.
BASE_FOLDER_NAME = '/data/milvus/base'
# Folder listed by load_query_vec for query vector files.
NQ_FOLDER_NAME = '/data/milvus/query'

# Folder receiving one "<idx>results.txt" file per query (see save_gt_file).
GT_ALL_FOLDER_NAME = 'ground_truth_all'
# Folder receiving the aggregated output files named below.
GT_FOLDER_NAME = 'ground_truth'
# Concatenation of all per-query result files (written by get_loc_txt).
LOC_FILE_NAME = 'ground_truth.txt'
# "<base file name> <1-based row>" per result (written by get_file_loc_txt).
FLOC_FILE_NAME = 'file_ground_truth.txt'
# Ground-truth vectors saved with np.save when GET_VEC is enabled.
VEC_FILE_NAME = 'vectors.npy'
|
23
|
|
|
|
|
24
|
|
|
|
|
25
|
|
|
# get vectors of the files |
|
26
|
|
|
def load_query_vec(nq, vectors=None, length=0):
    """Load query vectors from the files in NQ_FOLDER_NAME, in sorted file order.

    Args:
        nq: maximum number of vectors to load; 0 means "load every file".
        vectors: optional list to extend in place; a fresh list is created
            when omitted, so results are never shared between calls.
        length: number of vectors already counted towards ``nq`` before the
            first file is read.

    Returns:
        The list of loaded vectors (the ``vectors`` argument when one was given).
    """
    if vectors is None:
        vectors = []
    for filename in sorted(os.listdir(NQ_FOLDER_NAME)):
        vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
        already_counted = length
        length += len(vec_list)
        if nq != 0 and length >= nq:
            # Take only what is still missing. Using the running count (not
            # nq % len(vec_list)) stays correct when files differ in size.
            remaining = max(nq - already_counted, 0)
            vectors += vec_list[:remaining]
            break
        vectors += vec_list
    return vectors
|
38
|
|
|
|
|
39
|
|
|
|
|
40
|
|
|
# load vectors from file_name (limiting the result to nq vectors is done by load_query_vec)
|
41
|
|
|
def load_vec_list(file_name):
    """Read one vector file and return its contents as nested Python lists.

    The file is parsed with pandas.read_csv (no header row) when the CSV flag
    is set, otherwise with np.load. When the UINT8 flag is set the loaded
    values are rescaled as (data + 0.5) / 255 before conversion.
    """
    if not CSV:
        data = np.load(file_name)
    else:
        # pandas is only needed for CSV input, so it is imported on demand.
        import pandas as pd
        data = np.array(pd.read_csv(file_name, header=None))
    if UINT8:
        data = (data + 0.5) / 255
    return data.tolist()
|
52
|
|
|
|
|
53
|
|
|
|
|
54
|
|
|
def hex_to_bin(fp):
    """Expand a hexadecimal string into a list of 0/1 integers.

    Every character of ``fp`` contributes four bits, so the result has
    ``len(fp) * 4`` entries with leading zeros preserved.

    Args:
        fp: hexadecimal fingerprint string.

    Returns:
        List of ints (0 or 1), most significant bit first. An empty string
        yields an empty list instead of raising ValueError.
    """
    if not fp:
        return []
    width = len(fp) * 4
    # zfill restores the leading zero bits that int() -> binary text drops.
    bits = format(int(fp, 16), 'b').zfill(width)
    return [int(bit) for bit in bits]
|
62
|
|
|
|
|
63
|
|
|
|
|
64
|
|
|
def calEuclideanDistance(vec1, vec2):
    """Return the Euclidean (L2) distance between two equally sized vectors.

    Both inputs are converted to float64 before subtracting. Without that,
    narrow unsigned integer arrays (e.g. uint8) wrap around on subtraction
    and squaring and produce a wrong distance.

    Args:
        vec1, vec2: sequences or arrays of numbers.

    Returns:
        The distance as a NumPy float64 scalar.
    """
    a = np.asarray(vec1, dtype=np.float64)
    b = np.asarray(vec2, dtype=np.float64)
    return np.sqrt(np.sum(np.square(a - b)))
|
69
|
|
|
|
|
70
|
|
|
|
|
71
|
|
|
def calInnerDistance(vec1, vec2):
    """Return the inner product of two equally sized vectors.

    Integer arrays narrower than 64 bits are widened to int64 first, because
    np.inner accumulates in the input dtype and would otherwise overflow
    (e.g. uint8 products wrapping modulo 256).

    Args:
        vec1, vec2: sequences or arrays of numbers.

    Returns:
        The inner product as a NumPy scalar.
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)
    if a.dtype.kind in 'ui' and a.dtype.itemsize < 8:
        a = a.astype(np.int64)
    if b.dtype.kind in 'ui' and b.dtype.itemsize < 8:
        b = b.astype(np.int64)
    return np.inner(a, b)
|
76
|
|
|
|
|
77
|
|
|
|
|
78
|
|
|
def calTanimoto(vec1, vec2):
    """Return the Tanimoto coefficient of two hexadecimal fingerprints.

    Both strings are expanded to bit lists with hex_to_bin; the coefficient is
    common_bits / (bits1 + bits2 - common_bits).

    Args:
        vec1, vec2: hexadecimal fingerprint strings of equal length.

    Returns:
        The coefficient as a float. When neither fingerprint has any bit set
        the ratio is 0/0; 1.0 is returned for that case (two identical empty
        sets) instead of raising ZeroDivisionError.
    """
    bits1 = hex_to_bin(vec1)
    bits2 = hex_to_bin(vec2)
    nc = float(np.inner(bits1, bits2))
    n1 = float(np.sum(bits1))
    n2 = float(np.sum(bits2))
    union = n1 + n2 - nc
    if union == 0:
        return 1.0
    return nc / union
|
88
|
|
|
|
|
89
|
|
|
|
|
90
|
|
View Code Duplication |
def get_ground_truth_l2(topk, idx, vct_nq):
    """Brute-force the ``topk`` nearest base vectors to one query (L2 metric).

    Every file in BASE_FOLDER_NAME is scanned in sorted order. Each base vector
    gets the id "%01d%03d%06d" % (8, file_index, row_index). The ``topk``
    smallest distances are kept, sorted ascending, printed, and written with
    save_gt_file.

    Args:
        topk: number of neighbours to keep; values <= 0 give an empty result.
        idx: index of the query, forwarded to save_gt_file.
        vct_nq: the query vector.
    """
    no_dist = {}
    worst_key = None  # cached key of the largest kept distance; None = stale
    for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        for j, base_vec in enumerate(vec_list):
            dist = calEuclideanDistance(vct_nq, base_vec)
            num_j = "%01d%03d%06d" % (8, k, j)
            if len(no_dist) < topk:
                # Fill until topk entries exist, even when that spans several
                # files (the first file may hold fewer than topk vectors).
                no_dist[num_j] = dist
                worst_key = None
                continue
            if not no_dist:
                continue  # topk <= 0: nothing is ever kept
            if worst_key is None:
                worst_key = max(no_dist, key=no_dist.get)
            if dist < no_dist[worst_key]:
                del no_dist[worst_key]
                no_dist[num_j] = dist
                worst_key = None
    no_dist = sorted(no_dist.items(), key=lambda x: x[1])
    print(no_dist)
    save_gt_file(no_dist, idx)
|
113
|
|
|
|
|
114
|
|
|
|
|
115
|
|
View Code Duplication |
def get_ground_truth_ip(topk, idx, vct_nq):
    """Brute-force the ``topk`` best base vectors for one query (inner product).

    Every file in BASE_FOLDER_NAME is scanned in sorted order. Each base vector
    gets the id "%03d%06d" % (file_index, row_index). The ``topk`` largest
    inner products are kept, sorted descending, printed, and written with
    save_gt_file.

    Args:
        topk: number of neighbours to keep; values <= 0 give an empty result.
        idx: index of the query, forwarded to save_gt_file.
        vct_nq: the query vector.
    """
    no_dist = {}
    worst_key = None  # cached key of the smallest kept score; None = stale
    for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        for j, base_vec in enumerate(vec_list):
            dist = calInnerDistance(vct_nq, base_vec)
            num_j = "%03d%06d" % (k, j)
            if len(no_dist) < topk:
                # Fill until topk entries exist, even when that spans several
                # files (the first file may hold fewer than topk vectors).
                no_dist[num_j] = dist
                worst_key = None
                continue
            if not no_dist:
                continue  # topk <= 0: nothing is ever kept
            if worst_key is None:
                worst_key = min(no_dist, key=no_dist.get)
            if dist > no_dist[worst_key]:
                del no_dist[worst_key]
                no_dist[num_j] = dist
                worst_key = None
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
|
137
|
|
|
|
|
138
|
|
|
|
|
139
|
|
|
def get_ground_truth_tanimoto(topk, idx, vec_nq):
    """Brute-force the ``topk`` most similar base vectors (Tanimoto coefficient).

    Every file in BASE_FOLDER_NAME is scanned in sorted order; the path and row
    count of each file are printed as progress output. Each base vector gets
    the id "%03d%06d" % (file_index, row_index). The ``topk`` largest
    coefficients are kept, sorted descending, printed, and written with
    save_gt_file.

    Args:
        topk: number of neighbours to keep; values <= 0 give an empty result.
        idx: index of the query, forwarded to save_gt_file.
        vec_nq: the query fingerprint, passed to calTanimoto.
    """
    no_dist = {}
    worst_key = None  # cached key of the smallest kept score; None = stale
    for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        print(BASE_FOLDER_NAME + '/' + filename, len(vec_list))
        for j, base_vec in enumerate(vec_list):
            dist = calTanimoto(vec_nq, base_vec)
            num_j = "%03d%06d" % (k, j)
            if len(no_dist) < topk:
                # Fill until topk entries exist, even when that spans several
                # files (the first file may hold fewer than topk vectors).
                no_dist[num_j] = dist
                worst_key = None
                continue
            if not no_dist:
                continue  # topk <= 0: nothing is ever kept
            if worst_key is None:
                worst_key = min(no_dist, key=no_dist.get)
            if dist > no_dist[worst_key]:
                del no_dist[worst_key]
                no_dist[num_j] = dist
                worst_key = None
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
|
162
|
|
|
|
|
163
|
|
|
|
|
164
|
|
|
def save_gt_file(no_dist, idx):
    """Write one query's results to GT_ALL_FOLDER_NAME/<idx, 5 digits>results.txt.

    Args:
        no_dist: iterable of (id, score) entries; one "id score" line is
            written per entry, in the given order.
        idx: query index used to build the file name.
    """
    target = GT_ALL_FOLDER_NAME + '/' + ("%05d" % idx + 'results.txt')
    with open(target, 'w') as out:
        out.writelines(
            str(entry[0]) + ' ' + str(entry[1]) + '\n' for entry in no_dist
        )
|
169
|
|
|
|
|
170
|
|
|
def get_loc_txt(file):
    """Concatenate every per-query result file into GT_FOLDER_NAME/<file>.

    The files in GT_ALL_FOLDER_NAME are copied in sorted name order, and a
    blank line is written after each one so the queries stay separated.

    Args:
        file: name of the output file inside GT_FOLDER_NAME.
    """
    filenames = sorted(os.listdir(GT_ALL_FOLDER_NAME))
    # Context managers make sure the output and every input file are closed
    # (and the output flushed) even if copying fails part-way.
    with open(GT_FOLDER_NAME + '/' + file, 'w+') as write_file:
        for name in filenames:
            with open(GT_ALL_FOLDER_NAME + '/' + name, 'r') as result_file:
                write_file.writelines(result_file)
            write_file.write('\n')
|
178
|
|
|
|
|
179
|
|
|
|
|
180
|
|
|
def get_file_loc_txt(gt_file, fnames_file):
    """Translate result ids into "<base file name> <1-based row>" lines.

    Reads GT_FOLDER_NAME/<gt_file> and writes GT_FOLDER_NAME/<fnames_file>.
    The first field of each line is an id whose last six digits are the row
    offset and whose preceding three digits index the sorted listing of
    BASE_FOLDER_NAME. Slicing from the right end decodes both id layouts used
    in this file: "%01d%03d%06d" (L2) and "%03d%06d" (IP / Tanimoto).
    Lines without any field (the blank separators between queries) are copied
    through unchanged.

    NOTE(review): the fixed-width layout cannot represent a file index above
    999 or a row index above 999999 - confirm the data stays within that.

    Args:
        gt_file: name of the input file inside GT_FOLDER_NAME.
        fnames_file: name of the output file inside GT_FOLDER_NAME.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + gt_file, 'r') as gt_f, \
            open(GT_FOLDER_NAME + '/' + fnames_file, 'w') as fnames_f:
        for line in gt_f:
            fields = line.split()
            if not fields:
                fnames_f.write(line)
                continue
            vec_id = fields[0]
            loca = int(vec_id[-9:-6])
            offset = int(vec_id[-6:])
            fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
|
193
|
|
|
|
|
194
|
|
|
|
|
195
|
|
|
def load_gt_file_out():
    """Read GT_FOLDER_NAME/FLOC_FILE_NAME into two parallel lists.

    Returns:
        (base_filename, num): the first and second whitespace-separated field
        of every non-blank line, both kept as strings.
    """
    base_filename, num = [], []
    with open(GT_FOLDER_NAME + '/' + FLOC_FILE_NAME, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                continue  # blank separator line between queries
            base_filename.append(fields[0])
            num.append(fields[1])
    return base_filename, num
|
206
|
|
|
|
|
207
|
|
|
|
|
208
|
|
|
def ground_truth_process(metric, nq_list, topk, num):
    """Compute ground truth for one batch of queries, then rebuild the summaries.

    One worker process is started per query in ``nq_list``. After all workers
    finish, the aggregated files LOC_FILE_NAME and FLOC_FILE_NAME are
    regenerated. When GET_VEC is set, the referenced base vectors are also
    saved to VEC_FILE_NAME.

    Args:
        metric: 'L2', 'IP' or 'Tan'.
        nq_list: query vectors of this batch.
        topk: number of neighbours to keep per query.
        num: global index of the first query in ``nq_list``.

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
        Exception: any exception raised inside a worker is re-raised here
            rather than being silently dropped with the discarded future.
    """
    workers = {
        'L2': get_ground_truth_l2,
        'IP': get_ground_truth_ip,
        'Tan': get_ground_truth_tanimoto,
    }
    if metric not in workers:
        raise ValueError("unsupported metric: %r (expected 'L2', 'IP' or 'Tan')" % (metric,))
    worker = workers[metric]

    thread_num = len(nq_list)
    if thread_num:  # ProcessPoolExecutor rejects max_workers == 0
        with ProcessPoolExecutor(thread_num) as executor:
            futures = [
                executor.submit(worker, topk, num + i, query)
                for i, query in enumerate(nq_list)
            ]
            for future in futures:
                future.result()  # surfaces worker failures

    get_loc_txt(LOC_FILE_NAME)
    get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
    if GET_VEC:
        vec = []
        base_files, rows = load_gt_file_out()
        cached_name = None
        cached_vectors = None
        for base_file, row in zip(base_files, rows):
            # Consecutive results often come from the same base file, so the
            # most recently loaded file is reused instead of being re-read.
            if base_file != cached_name:
                cached_vectors = load_vec_list(BASE_FOLDER_NAME + '/' + base_file)
                cached_name = base_file
            vec.append(cached_vectors[int(row) - 1])  # rows are stored 1-based
        print("saved len of vec:", len(vec))
        np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
|
230
|
|
|
|
|
231
|
|
|
|
|
232
|
|
|
def main():
    """Command-line entry point: parse the options and generate ground truth.

    Options:
        -h / --help      print usage and exit
        -q / --nq N      number of query vectors to use (0 = all, the default)
        -k / --topk K    number of neighbours per query (required)
        -m / --metric M  'L2', 'IP' or 'Tan' (required)
        -l               run the ground-truth generation

    All options are parsed before anything runs, so ``-l`` may appear anywhere
    on the command line. The run creates GT_ALL_FOLDER_NAME (which must not
    exist yet) and GT_FOLDER_NAME, then processes the queries in rounds of
    PROCESS_NUM.
    """
    usage = "Usage: test.py [-q <nq>] -k <topk> -m <metric> -l"
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:],
            "hlq:k:m:",
            ["help", "nq=", "topk=", "metric="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    nq = 0
    topk = None
    metric = None
    run_requested = False
    for opt_name, opt_value in opts:
        if opt_name in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt_name in ("-q", "--nq"):
            nq = int(opt_value)
        elif opt_name in ("-k", "--topk"):
            topk = int(opt_value)
        elif opt_name in ("-m", "--metric"):
            metric = opt_value
        elif opt_name == "-l":
            run_requested = True

    if not run_requested:
        # Nothing is generated without -l; show how to start a run.
        print(usage)
        return
    if topk is None or metric is None:
        # Previously this surfaced as a NameError further down.
        print(usage)
        sys.exit(2)

    try:
        os.mkdir(GT_ALL_FOLDER_NAME)
    except FileExistsError:
        print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
        sys.exit()
    if not os.path.exists(GT_FOLDER_NAME):
        os.mkdir(GT_FOLDER_NAME)

    print("metric type is", metric)
    time_start = time.time()
    query_vectors = load_query_vec(nq)
    nq = len(query_vectors)
    print("query list:", len(query_vectors))
    # Slicing past the end of a list is safe, so the last (possibly shorter)
    # round needs no special case.
    for round_no, start in enumerate(range(0, nq, PROCESS_NUM), 1):
        print("start with round:", round_no)
        ground_truth_process(metric, query_vectors[start:start + PROCESS_NUM], topk, start)

    time_end = time.time()
    time_cost = time_end - time_start
    print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
|
278
|
|
|
|
|
279
|
|
|
|
|
280
|
|
|
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|
282
|
|
|
|