1
|
|
|
import getopt |
2
|
|
|
import os |
3
|
|
|
import sys |
4
|
|
|
import time |
5
|
|
|
from concurrent.futures import ThreadPoolExecutor |
6
|
|
|
from concurrent.futures import ProcessPoolExecutor |
7
|
|
|
import numpy as np |
8
|
|
|
import math |
9
|
|
|
|
10
|
|
|
# Number of query vectors handed to ground_truth_process() per round; that
# function starts one worker process per query vector it receives.
PROCESS_NUM = 12
# When True, ground_truth_process() also saves the ground-truth vectors
# themselves to VEC_FILE_NAME.
GET_VEC = False
# When True, load_vec_list() reads header-less CSV files instead of .npy files.
CSV = False
# When True, load_vec_list() rescales loaded values as (value + 0.5) / 255.
UINT8 = False

# Folder holding the base vector files that are searched.
BASE_FOLDER_NAME = '/data/milvus/base'
# Folder holding the query vector files.
NQ_FOLDER_NAME = '/data/milvus/query'

# Folder receiving one result file per query vector.
GT_ALL_FOLDER_NAME = 'ground_truth_all'
# Folder receiving the merged ground-truth files named below.
GT_FOLDER_NAME = 'ground_truth'
# Merged "<id> <distance>" listing of all queries.
LOC_FILE_NAME = 'ground_truth.txt'
# "<base file name> <1-based row>" listing derived from LOC_FILE_NAME.
FLOC_FILE_NAME = 'file_ground_truth.txt'
# Ground-truth vectors, written with np.save when GET_VEC is set.
VEC_FILE_NAME = 'vectors.npy'
23
|
|
|
|
24
|
|
|
|
25
|
|
|
# load the query vectors from the files in NQ_FOLDER_NAME
26
|
|
|
def load_query_vec(nq, vectors=None, length=0):
    """Load query vectors from the files in NQ_FOLDER_NAME.

    Files are read in sorted name order until ``nq`` vectors have been
    collected.

    Args:
        nq: Maximum number of vectors to load; 0 loads every file completely.
        vectors: Optional list that is extended in place. A new list is
            created when it is omitted.
        length: Number of vectors already counted towards ``nq``.

    Returns:
        The list of loaded vectors (``vectors`` itself when it was supplied).
    """
    if vectors is None:
        # A literal [] default would be shared by every call of the function.
        vectors = []
    for filename in sorted(os.listdir(NQ_FOLDER_NAME)):
        if nq != 0 and length >= nq:
            # Quota already reached: do not load another file for nothing.
            break
        vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
        if nq != 0 and length + len(vec_list) > nq:
            # Take only what is still missing. Counting from the running
            # total stays correct when the files differ in size.
            vectors += vec_list[:nq - length]
            break
        vectors += vec_list
        length += len(vec_list)
    return vectors
38
|
|
|
|
39
|
|
|
|
40
|
|
|
# load the vectors stored in file_name and return them as a list
41
|
|
|
def load_vec_list(file_name):
    """Read one vector file and return its content as nested Python lists.

    The file is parsed as header-less CSV when the module flag CSV is set,
    otherwise it is loaded with np.load. When UINT8 is set the values are
    rescaled as (value + 0.5) / 255 before conversion.
    """
    if not CSV:
        data = np.load(file_name)
    else:
        # pandas is only needed for CSV input, hence the local import.
        import pandas as pd
        data = np.array(pd.read_csv(file_name, header=None))
    if UINT8:
        data = (data + 0.5) / 255
    return data.tolist()
52
|
|
|
|
53
|
|
|
|
54
|
|
|
def hex_to_bin(fp):
    """Expand a hexadecimal string into a list of 0/1 integers.

    Every character of ``fp`` contributes four bits, most significant bit
    first, so leading zero digits are preserved in the output.
    """
    width = len(fp) * 4
    bits = format(int(fp, 16), 'b').zfill(width)
    return [int(bit) for bit in bits]
62
|
|
|
|
63
|
|
|
|
64
|
|
|
def calEuclideanDistance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors."""
    diff = np.array(vec1) - np.array(vec2)
    return np.sqrt(np.square(diff).sum())
69
|
|
|
|
70
|
|
|
|
71
|
|
|
def calInnerDistance(vec1, vec2):
    """Return the inner product of two vectors."""
    return np.inner(np.array(vec1), np.array(vec2))
76
|
|
|
|
77
|
|
|
|
78
|
|
|
def calTanimoto(vec1, vec2):
    """Return the Tanimoto coefficient of two hexadecimal fingerprints.

    Both arguments are hexadecimal strings; each is expanded to a bit list
    with hex_to_bin(). The coefficient is
    ``common / (bits1 + bits2 - common)`` where ``common`` is the number of
    positions set in both fingerprints.

    Returns:
        The coefficient as a float. 1.0 is returned when neither fingerprint
        has a set bit, where the formula would divide by zero.
    """
    bits1 = hex_to_bin(vec1)
    bits2 = hex_to_bin(vec2)
    nc = float(np.inner(bits1, bits2))
    n1 = float(np.sum(bits1))
    n2 = float(np.sum(bits2))
    union = n1 + n2 - nc
    if union == 0:
        # Both fingerprints are all-zero, hence identical.
        # NOTE(review): 1.0 is a convention for this case -- confirm it
        # matches the engine the ground truth is compared against.
        return 1.0
    return nc / union
88
|
|
|
|
89
|
|
|
|
90
|
|
View Code Duplication |
def get_ground_truth_l2(topk, idx, vct_nq):
    """Find the topk base vectors nearest to one query vector (L2 distance).

    Every file in BASE_FOLDER_NAME is scanned in sorted name order. A base
    vector is identified by "8" + 3-digit file index + 6-digit row index.
    The result, sorted by ascending distance, is printed and passed to
    save_gt_file().

    Args:
        topk: Number of neighbours to keep; must be positive.
        idx: Index of the query vector, forwarded to save_gt_file().
        vct_nq: The query vector.

    Raises:
        ValueError: If ``topk`` is not positive.
    """
    if topk <= 0:
        raise ValueError("topk must be positive")
    no_dist = {}  # id -> distance of the best candidates seen so far
    for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        for j, base_vec in enumerate(vec_list):
            dist = calEuclideanDistance(vct_nq, base_vec)
            num_j = "%01d%03d%06d" % (8, k, j)
            if len(no_dist) < topk:
                # Fill up to topk candidates regardless of which file they
                # come from, so a short first file cannot shrink the result.
                no_dist[num_j] = dist
                continue
            # Replace the current worst candidate when this one is closer.
            max_key = max(no_dist, key=no_dist.get)
            if dist < no_dist[max_key]:
                del no_dist[max_key]
                no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1])
    print(no_dist)
    save_gt_file(no_dist, idx)
113
|
|
|
|
114
|
|
|
|
115
|
|
View Code Duplication |
def get_ground_truth_ip(topk, idx, vct_nq):
    """Find the topk base vectors with the largest inner product to a query.

    Every file in BASE_FOLDER_NAME is scanned in sorted name order. A base
    vector is identified by 3-digit file index + 6-digit row index. The
    result, sorted by descending inner product, is printed and passed to
    save_gt_file().

    Args:
        topk: Number of neighbours to keep; must be positive.
        idx: Index of the query vector, forwarded to save_gt_file().
        vct_nq: The query vector.

    Raises:
        ValueError: If ``topk`` is not positive.
    """
    if topk <= 0:
        raise ValueError("topk must be positive")
    no_dist = {}  # id -> inner product of the best candidates seen so far
    for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        for j, base_vec in enumerate(vec_list):
            dist = calInnerDistance(vct_nq, base_vec)
            num_j = "%03d%06d" % (k, j)
            if len(no_dist) < topk:
                # Fill up to topk candidates regardless of which file they
                # come from, so a short first file cannot shrink the result.
                no_dist[num_j] = dist
                continue
            # Replace the current weakest candidate when this one is larger.
            min_key = min(no_dist, key=no_dist.get)
            if dist > no_dist[min_key]:
                del no_dist[min_key]
                no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
137
|
|
|
|
138
|
|
|
|
139
|
|
|
def get_ground_truth_tanimoto(topk, idx, vec_nq):
    """Find the topk base entries with the largest Tanimoto coefficient.

    Every file in BASE_FOLDER_NAME is scanned in sorted name order; the path
    and entry count of each file are printed as it is loaded. A base entry
    is identified by 3-digit file index + 6-digit row index. The result,
    sorted by descending coefficient, is printed and passed to
    save_gt_file().

    Args:
        topk: Number of neighbours to keep; must be positive.
        idx: Index of the query entry, forwarded to save_gt_file().
        vec_nq: The query fingerprint, in the form calTanimoto() accepts.

    Raises:
        ValueError: If ``topk`` is not positive.
    """
    if topk <= 0:
        raise ValueError("topk must be positive")
    no_dist = {}  # id -> coefficient of the best candidates seen so far
    for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        print(BASE_FOLDER_NAME + '/' + filename, len(vec_list))
        for j, base_vec in enumerate(vec_list):
            dist = calTanimoto(vec_nq, base_vec)
            num_j = "%03d%06d" % (k, j)
            if len(no_dist) < topk:
                # Fill up to topk candidates regardless of which file they
                # come from, so a short first file cannot shrink the result.
                no_dist[num_j] = dist
                continue
            # Replace the current weakest candidate when this one is larger.
            min_key = min(no_dist, key=no_dist.get)
            if dist > no_dist[min_key]:
                del no_dist[min_key]
                no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
162
|
|
|
|
163
|
|
|
|
164
|
|
|
def save_gt_file(no_dist, idx):
    """Write the results of one query into GT_ALL_FOLDER_NAME.

    The file is named after ``idx`` zero-padded to five digits followed by
    "results.txt". Each entry of ``no_dist`` becomes one line of the form
    "<entry[0]> <entry[1]>".
    """
    target = GT_ALL_FOLDER_NAME + '/' + "%05dresults.txt" % idx
    with open(target, 'w') as out:
        out.writelines(
            ' '.join((str(entry[0]), str(entry[1]))) + '\n'
            for entry in no_dist
        )
169
|
|
|
|
170
|
|
|
def get_loc_txt(file):
    """Merge all per-query result files into GT_FOLDER_NAME/<file>.

    The files of GT_ALL_FOLDER_NAME are concatenated in sorted name order,
    with one empty line written after the content of each file.

    Args:
        file: Name of the merged output file inside GT_FOLDER_NAME.
    """
    filenames = sorted(os.listdir(GT_ALL_FOLDER_NAME))
    # Context managers make sure the merged file is flushed and closed
    # before anything reads it back.
    with open(GT_FOLDER_NAME + '/' + file, 'w') as merged:
        for name in filenames:
            with open(GT_ALL_FOLDER_NAME + '/' + name, 'r') as part:
                merged.writelines(part)
            merged.write('\n')
178
|
|
|
|
179
|
|
|
|
180
|
|
|
def get_file_loc_txt(gt_file, fnames_file):
    """Translate vector ids into "<base file name> <1-based row>" lines.

    Each non-empty line of GT_FOLDER_NAME/<gt_file> starts with a vector id
    whose last six digits are the row index and whose three digits before
    them are the index of the file in the sorted listing of
    BASE_FOLDER_NAME. Empty lines are copied through unchanged.

    Args:
        gt_file: Name of the input file inside GT_FOLDER_NAME.
        fnames_file: Name of the output file inside GT_FOLDER_NAME.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + gt_file, 'r') as gt_f:
        with open(GT_FOLDER_NAME + '/' + fnames_file, 'w') as fnames_f:
            for line in gt_f:
                fields = line.split()
                if not fields:
                    fnames_f.write(line)
                    continue
                vec_id = fields[0]
                # Slice from the right so that both id layouts used in this
                # file are decoded: 10 digits with a leading "8" (L2) and
                # 9 digits without it (IP, Tanimoto).
                loca = int(vec_id[-9:-6])
                offset = int(vec_id[-6:])
                fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
193
|
|
|
|
194
|
|
|
|
195
|
|
|
def load_gt_file_out():
    """Read GT_FOLDER_NAME/FLOC_FILE_NAME into two parallel lists.

    Returns:
        A pair ``(base_filename, num)`` holding the first and the second
        whitespace-separated field of every non-empty line, as strings.
    """
    path = GT_FOLDER_NAME + '/' + FLOC_FILE_NAME
    with open(path, 'r') as handle:
        rows = [fields for fields in map(str.split, handle) if fields]
    base_filename = [fields[0] for fields in rows]
    num = [fields[1] for fields in rows]
    return base_filename, num
206
|
|
|
|
207
|
|
|
|
208
|
|
|
def ground_truth_process(metric, nq_list, topk, num):
    """Compute the ground truth for one batch of query vectors.

    One worker process is started per query vector. When all workers are
    done the per-query files are merged with get_loc_txt() and translated
    with get_file_loc_txt(). If GET_VEC is set, the ground-truth vectors are
    additionally collected and saved to GT_FOLDER_NAME/VEC_FILE_NAME.

    Args:
        metric: 'L2', 'IP' or 'Tan'.
        nq_list: The query vectors of this batch.
        topk: Number of neighbours to keep per query.
        num: Index of the first query of the batch; query ``i`` of the batch
            is saved under index ``num + i``.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
        Exception: Whatever a worker raised is re-raised here.
    """
    workers = {
        'L2': get_ground_truth_l2,
        'IP': get_ground_truth_ip,
        'Tan': get_ground_truth_tanimoto,
    }
    if metric not in workers:
        # Otherwise nothing would be computed and empty files reported as success.
        raise ValueError("unsupported metric: %r (expected 'L2', 'IP' or 'Tan')" % (metric,))
    worker = workers[metric]
    thread_num = len(nq_list)
    with ProcessPoolExecutor(thread_num) as executor:
        futures = [
            executor.submit(worker, topk, num + i, query)
            for i, query in enumerate(nq_list)
        ]
    # The pool has shut down, so every future is finished; result() re-raises
    # a worker failure instead of letting it pass unnoticed.
    for future in futures:
        future.result()
    get_loc_txt(LOC_FILE_NAME)
    get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
    if GET_VEC:
        vec = []
        base_files, rows = load_gt_file_out()
        for base_file, row in zip(base_files, rows):
            vectors = load_vec_list(BASE_FOLDER_NAME + '/' + base_file)
            # Rows are stored 1-based in the file.
            vec.append(vectors[int(row) - 1])
        print("saved len of vec:", len(vec))
        np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
230
|
|
|
|
231
|
|
|
|
232
|
|
|
def main():
    """Command-line entry point.

    Options:
        -h, --help        print the usage line and exit
        -q, --nq N        number of query vectors to use (default 0: all)
        -k, --topk K      number of neighbours to keep per query
        -m, --metric M    metric name passed to ground_truth_process()
        -l                compute the ground truth

    Nothing is computed unless -l is given. With -l, both -k and -m are
    required; otherwise the usage line is printed and the exit status is 2.
    """
    usage = "Usage: test.py [-q <nq>] -k <topk> -m <metric> -l"
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:],
            "hlq:k:m:",
            ["help", "nq=", "topk=", "metric="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    nq = 0
    topk = None
    metric = None
    run_requested = False
    for opt_name, opt_value in opts:
        if opt_name in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt_name in ("-q", "--nq"):
            nq = int(opt_value)
        elif opt_name in ("-k", "--topk"):
            topk = int(opt_value)
        elif opt_name in ("-m", "--metric"):
            metric = opt_value
        elif opt_name == "-l":
            run_requested = True

    if not run_requested:
        return
    if topk is None or metric is None:
        # Checked after all options are parsed so their order does not matter.
        print(usage)
        sys.exit(2)

    try:
        os.mkdir(GT_ALL_FOLDER_NAME)
    except FileExistsError:
        # Only "already exists" is expected; other OS errors propagate.
        print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
        sys.exit()
    if not os.path.exists(GT_FOLDER_NAME):
        os.mkdir(GT_FOLDER_NAME)

    print("metric type is", metric)
    time_start = time.time()
    query_vectors = load_query_vec(nq)
    nq = len(query_vectors)
    print("query list:", len(query_vectors))
    # Slicing past the end is safe, so the last, shorter round needs no
    # special case.
    for round_no, start in enumerate(range(0, nq, PROCESS_NUM), 1):
        print("start with round:", round_no)
        ground_truth_process(metric, query_vectors[start:start + PROCESS_NUM], topk, start)

    time_cost = time.time() - time_start
    print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
278
|
|
|
|
279
|
|
|
|
280
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
282
|
|
|
|