milvus_ground_truth.hex_to_bin()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 8
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 8
nop 1
dl 0
loc 8
rs 10
c 0
b 0
f 0
1
import getopt
2
import os
3
import sys
4
import time
5
from concurrent.futures import ThreadPoolExecutor
6
from concurrent.futures import ProcessPoolExecutor
7
import numpy as np
8
import math
9
10
# Number of query vectors handled per round (main() slices the queries into
# batches of this size, and each batch entry gets its own worker process).
PROCESS_NUM = 12
# When true, the matched base vectors themselves are saved to VEC_FILE_NAME.
GET_VEC = False
# When true, vector files are read as header-less CSV instead of .npy.
CSV = False
# When true, loaded values are rescaled with (x + 0.5) / 255.
UINT8 = False

# Input folders: the base vectors to search and the query vectors.
BASE_FOLDER_NAME = "/data/milvus/base"
NQ_FOLDER_NAME = "/data/milvus/query"

# Output folders (per-query results, merged results) and output file names.
GT_ALL_FOLDER_NAME = "ground_truth_all"
GT_FOLDER_NAME = "ground_truth"
LOC_FILE_NAME = "ground_truth.txt"
FLOC_FILE_NAME = "file_ground_truth.txt"
VEC_FILE_NAME = "vectors.npy"
23
24
25
# get vectors of the files
26
def load_query_vec(nq, vectors=None, length=0):
    """Load query vectors from the files in NQ_FOLDER_NAME.

    Files are read in sorted name order and their vectors are appended until
    ``nq`` vectors have been collected.

    Args:
        nq: Maximum total number of vectors wanted; 0 means "load everything".
        vectors: Optional list to extend in place. A fresh list is created
            when omitted, so repeated calls no longer share one default list.
        length: Number of vectors already counted towards ``nq``.

    Returns:
        The list of loaded vectors (the same object as ``vectors`` when one
        was passed in).
    """
    if vectors is None:
        vectors = []
    for filename in sorted(os.listdir(NQ_FOLDER_NAME)):
        vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
        if nq != 0 and length + len(vec_list) > nq:
            # Take only what is still missing. Counting from the running
            # total stays correct when the files have different sizes.
            vectors += vec_list[:max(nq - length, 0)]
            break
        length += len(vec_list)
        vectors += vec_list
    return vectors
38
39
40
# load every vector stored in file_name and return them as a list
41
def load_vec_list(file_name):
    """Read one vector file and return its contents as nested Python lists.

    The file is parsed as header-less CSV (via pandas) when the module flag
    CSV is set, otherwise it is loaded with ``np.load``. When UINT8 is set
    the values are rescaled with ``(x + 0.5) / 255`` before conversion.
    """
    if not CSV:
        raw = np.load(file_name)
    else:
        # pandas is only needed for CSV input, so it is imported lazily.
        import pandas as pd
        raw = np.array(pd.read_csv(file_name, header=None))
    if UINT8:
        raw = (raw + 0.5) / 255
    return raw.tolist()
52
53
54
def hex_to_bin(fp):
    """Expand a hexadecimal string into a list of 0/1 integers.

    Every character of ``fp`` contributes four bits, so leading zero digits
    are preserved and the result always has ``len(fp) * 4`` entries.

    Args:
        fp: String of hexadecimal digits.

    Returns:
        List of ints (0 or 1), most significant bit first.

    Raises:
        ValueError: If ``fp`` cannot be parsed as a base-16 number.
    """
    width = len(fp) * 4
    # zfill restores the leading zeros that int() discards.
    bits = format(int(fp, 16), 'b').zfill(width)
    return [int(bit) for bit in bits]
62
63
64
def calEuclideanDistance(vec1, vec2):
    """Return the Euclidean (L2) distance between two equal-length vectors."""
    delta = np.array(vec1) - np.array(vec2)
    return np.sqrt(np.sum(np.square(delta)))
69
70
71
def calInnerDistance(vec1, vec2):
    """Return the inner product of two equal-length vectors."""
    return np.inner(np.array(vec1), np.array(vec2))
76
77
78
def calTanimoto(vec1, vec2):
    """Return the Tanimoto coefficient of two hexadecimal fingerprints.

    Both arguments are expanded to bit lists with ``hex_to_bin``; the result
    is ``common / (ones1 + ones2 - common)`` where ``common`` is the number
    of positions set in both fingerprints.

    Args:
        vec1: Hexadecimal string of the first fingerprint.
        vec2: Hexadecimal string of the second fingerprint (same length).

    Returns:
        The coefficient as a float. When neither fingerprint has any bit set
        the ratio is undefined and 0.0 is returned instead of raising
        ZeroDivisionError.
        NOTE(review): 0.0 is a chosen convention for that case - confirm it
        matches the engine this ground truth is compared against.
    """
    bits1 = hex_to_bin(vec1)
    bits2 = hex_to_bin(vec2)
    nc = float(np.inner(bits1, bits2))
    n1 = float(np.sum(bits1))
    n2 = float(np.sum(bits2))
    union = n1 + n2 - nc
    if union == 0:
        return 0.0
    # The per-pair debug print was dropped: it ran once for every base
    # vector and the other metric helpers do not print.
    return nc / union
88
89
90 View Code Duplication
def get_ground_truth_l2(topk, idx, vct_nq):
    """Brute-force the ``topk`` nearest base vectors by Euclidean distance.

    Every file in BASE_FOLDER_NAME is scanned in sorted name order. Each
    vector gets the id ``"8" + <3-digit file index> + <6-digit row index>``.
    The result, sorted by ascending distance, is printed and written with
    ``save_gt_file``.

    Args:
        topk: Number of neighbours to keep.
        idx: Index of the query, used to name the result file.
        vct_nq: The query vector.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    no_dist = {}
    for k, filename in enumerate(filenames):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        for j, vec in enumerate(vec_list):
            dist = calEuclideanDistance(vct_nq, vec)
            num_j = "%01d%03d%06d" % (8, k, j)
            if len(no_dist) < topk:
                # Keep seeding until topk candidates are held, even when the
                # first file alone holds fewer than topk vectors.
                no_dist[num_j] = dist
            elif no_dist:
                # Replace the current worst candidate if this one is closer.
                max_key = max(no_dist, key=no_dist.get)
                if dist < no_dist[max_key]:
                    del no_dist[max_key]
                    no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1])
    print(no_dist)
    save_gt_file(no_dist, idx)
113
114
115 View Code Duplication
def get_ground_truth_ip(topk, idx, vct_nq):
    """Brute-force the ``topk`` base vectors with the largest inner product.

    Every file in BASE_FOLDER_NAME is scanned in sorted name order. Each
    vector gets the id ``<3-digit file index> + <6-digit row index>``. The
    result, sorted by descending inner product, is printed and written with
    ``save_gt_file``.

    Args:
        topk: Number of neighbours to keep.
        idx: Index of the query, used to name the result file.
        vct_nq: The query vector.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    no_dist = {}
    for k, filename in enumerate(filenames):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        for j, vec in enumerate(vec_list):
            dist = calInnerDistance(vct_nq, vec)
            num_j = "%03d%06d" % (k, j)
            if len(no_dist) < topk:
                # Keep seeding until topk candidates are held, even when the
                # first file alone holds fewer than topk vectors.
                no_dist[num_j] = dist
            elif no_dist:
                # Replace the current weakest candidate if this one scores higher.
                min_key = min(no_dist, key=no_dist.get)
                if dist > no_dist[min_key]:
                    del no_dist[min_key]
                    no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
137
138
139
def get_ground_truth_tanimoto(topk, idx, vec_nq):
    """Brute-force the ``topk`` base fingerprints with the highest Tanimoto score.

    Every file in BASE_FOLDER_NAME is scanned in sorted name order (the file
    path and its vector count are printed as it is loaded). Each vector gets
    the id ``<3-digit file index> + <6-digit row index>``. The result, sorted
    by descending score, is printed and written with ``save_gt_file``.

    Args:
        topk: Number of neighbours to keep.
        idx: Index of the query, used to name the result file.
        vec_nq: The query fingerprint, as accepted by ``calTanimoto``.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    no_dist = {}
    for k, filename in enumerate(filenames):
        vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
        print(BASE_FOLDER_NAME + '/' + filename, len(vec_list))
        for j, vec in enumerate(vec_list):
            dist = calTanimoto(vec_nq, vec)
            num_j = "%03d%06d" % (k, j)
            if len(no_dist) < topk:
                # Keep seeding until topk candidates are held, even when the
                # first file alone holds fewer than topk vectors.
                no_dist[num_j] = dist
            elif no_dist:
                # Replace the current weakest candidate if this one scores higher.
                min_key = min(no_dist, key=no_dist.get)
                if dist > no_dist[min_key]:
                    del no_dist[min_key]
                    no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
162
163
164
def save_gt_file(no_dist, idx):
    """Write one query's results to ``<GT_ALL_FOLDER_NAME>/<idx, 5 digits>results.txt``.

    ``no_dist`` is a sequence of (id, score) pairs; each pair becomes one
    line of the form ``"<id> <score>"``.
    """
    out_path = GT_ALL_FOLDER_NAME + '/' + ("%05d" % idx + 'results.txt')
    with open(out_path, 'w') as out:
        out.writelines(
            str(entry[0]) + ' ' + str(entry[1]) + '\n' for entry in no_dist
        )
169
170
def get_loc_txt(file):
    """Merge all per-query result files into ``<GT_FOLDER_NAME>/<file>``.

    The files in GT_ALL_FOLDER_NAME are concatenated in sorted name order,
    with one empty line written after each of them as a separator.

    Args:
        file: Name of the merged output file inside GT_FOLDER_NAME.
    """
    filenames = sorted(os.listdir(GT_ALL_FOLDER_NAME))
    # Context managers guarantee the merged file is flushed and closed before
    # any later step reads it back, and that no input handle is leaked.
    with open(GT_FOLDER_NAME + '/' + file, 'w+') as write_file:
        for name in filenames:
            with open(GT_ALL_FOLDER_NAME + '/' + name, 'r') as part:
                write_file.writelines(part)
            write_file.write('\n')
178
179
180
def get_file_loc_txt(gt_file, fnames_file):
    """Translate vector ids in the merged ground truth into file locations.

    Each non-empty line of ``<GT_FOLDER_NAME>/<gt_file>`` starts with a
    vector id whose last six digits are the row index and whose three digits
    before that are the index of the base file (in sorted name order). For
    every id a line ``"<base file name> <row index + 1>"`` is written to
    ``<GT_FOLDER_NAME>/<fnames_file>``; empty separator lines are copied.

    Args:
        gt_file: Name of the merged ground-truth file to read.
        fnames_file: Name of the file-location file to write.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + gt_file, 'r') as gt_f, \
            open(GT_FOLDER_NAME + '/' + fnames_file, 'w') as fnames_f:
        for line in gt_f:
            if line == '\n':
                fnames_f.write(line)
                continue
            vec_id = line.split()[0]
            # Decode from the right-hand side so that both id layouts work:
            # the 10-character L2 ids (with a leading marker digit) and the
            # 9-character IP/Tanimoto ids (without one).
            loca = int(vec_id[-9:-6])
            offset = int(vec_id[-6:])
            fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
193
194
195
def load_gt_file_out():
    """Read ``<GT_FOLDER_NAME>/<FLOC_FILE_NAME>`` into two parallel lists.

    Returns:
        A tuple ``(names, positions)`` holding the first and second
        whitespace-separated field of every non-blank line; both fields are
        kept as strings.
    """
    path = GT_FOLDER_NAME + '/' + FLOC_FILE_NAME
    names, positions = [], []
    with open(path, 'r') as handle:
        for row in handle:
            fields = row.split()
            if not fields:
                # Blank separator line between queries.
                continue
            names.append(fields[0])
            positions.append(fields[1])
    return names, positions
206
207
208
def ground_truth_process(metric, nq_list, topk, num):
    """Compute the ground truth for one batch of queries in parallel.

    One worker process is started per query in ``nq_list``. Afterwards the
    per-query result files are merged (``get_loc_txt``) and translated into
    file locations (``get_file_loc_txt``). When GET_VEC is set, the matched
    base vectors are also collected and saved to VEC_FILE_NAME.

    Args:
        metric: 'L2', 'IP' or 'Tan'.
        nq_list: The query vectors of this batch.
        topk: Number of neighbours to keep per query.
        num: Global index of the first query in ``nq_list``.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
        Exception: Whatever a worker raised; failures are re-raised here
            instead of being silently dropped.
    """
    workers = {
        'L2': get_ground_truth_l2,
        'IP': get_ground_truth_ip,
        'Tan': get_ground_truth_tanimoto,
    }
    if metric not in workers:
        raise ValueError(
            "unsupported metric %r, expected one of 'L2', 'IP', 'Tan'" % (metric,)
        )
    worker = workers[metric]

    # ProcessPoolExecutor rejects a worker count of 0, so skip empty batches.
    if nq_list:
        with ProcessPoolExecutor(len(nq_list)) as executor:
            futures = [
                executor.submit(worker, topk, num + i, query)
                for i, query in enumerate(nq_list)
            ]
        for future in futures:
            # result() re-raises any exception that occurred in the worker.
            future.result()

    get_loc_txt(LOC_FILE_NAME)
    get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
    if GET_VEC:
        vec = []
        base_files, positions = load_gt_file_out()
        for base_file, position in zip(base_files, positions):
            # Positions in the location file are 1-based.
            n = int(position) - 1
            vectors = load_vec_list(BASE_FOLDER_NAME + '/' + base_file)
            vec.append(vectors[n])
        print("saved len of vec:", len(vec))
        np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
230
231
232
def main():
    """Command-line entry point.

    Options:
        -q/--nq <n>       number of query vectors to use (0 or omitted: all)
        -k/--topk <n>     number of neighbours per query
        -m/--metric <m>   metric name passed to ``ground_truth_process``
        -l                compute the ground truth
        -h/--help         print usage and exit

    All options are parsed before any work starts, so ``-l`` may appear
    anywhere on the command line. Without ``-l`` nothing is computed.
    """
    usage = "test.py [-q <nq>] -k <topk> -m <metric> -l"
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "hlq:k:m:",
            ["help", "nq=", "topk=", "metric="],
        )
    except getopt.GetoptError:
        print("Usage: " + usage)
        sys.exit(2)

    nq = 0
    topk = None
    metric = None
    run_ground_truth = False
    for opt_name, opt_value in opts:
        if opt_name in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt_name in ("-q", "--nq"):
            nq = int(opt_value)
        elif opt_name in ("-k", "--topk"):
            topk = int(opt_value)
        elif opt_name in ("-m", "--metric"):
            metric = opt_value
        elif opt_name == "-l":
            run_ground_truth = True

    if not run_ground_truth:
        return
    if topk is None or metric is None:
        # Both are required; previously this surfaced as a NameError.
        print("Usage: " + usage)
        sys.exit(2)

    try:
        os.mkdir(GT_ALL_FOLDER_NAME)
    except FileExistsError:
        # Stale per-query files would be merged into the new ground truth.
        print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
        sys.exit()
    os.makedirs(GT_FOLDER_NAME, exist_ok=True)

    print("metric type is", metric)
    time_start = time.time()
    query_vectors = load_query_vec(nq)
    nq = len(query_vectors)
    print("query list:", len(query_vectors))
    # Process the queries in batches of PROCESS_NUM; slicing clamps at the
    # end of the list, so the last (shorter) batch needs no special case.
    for round_no, start in enumerate(range(0, nq, PROCESS_NUM), 1):
        print("start with round:", round_no)
        ground_truth_process(
            metric, query_vectors[start:start + PROCESS_NUM], topk, start
        )

    time_end = time.time()
    time_cost = time_end - time_start
    print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
278
279
280
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
282