Total Complexity | 68 |
Total Lines | 282 |
Duplicated Lines | 15.96 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like milvus_ground_truth often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | import getopt |
||
2 | import os |
||
3 | import sys |
||
4 | import time |
||
5 | from concurrent.futures import ThreadPoolExecutor |
||
6 | from concurrent.futures import ProcessPoolExecutor |
||
7 | import numpy as np |
||
8 | import math |
||
9 | |||
# Number of query vectors handled per round; main() slices the queries into
# batches of this size and each batch gets one worker process per query.
PROCESS_NUM = 12
# When True, the matched base vectors are also saved to VEC_FILE_NAME.
GET_VEC = False
# When True, vector files are read as header-less CSV via pandas instead of np.load.
CSV = False
# When True, loaded values are rescaled with (x + 0.5) / 255.
UINT8 = False

# Folder holding the base vector files that are searched.
BASE_FOLDER_NAME = '/data/milvus/base'
# Folder holding the query vector files.
NQ_FOLDER_NAME = '/data/milvus/query'

# Folder receiving one result file per query.
GT_ALL_FOLDER_NAME = 'ground_truth_all'
# Folder receiving the merged output files named below.
GT_FOLDER_NAME = 'ground_truth'
# Merged "<key> <distance>" lines of all queries, blank line between queries.
LOC_FILE_NAME = 'ground_truth.txt'
# Same layout, but each line is "<base file name> <1-based offset>".
FLOC_FILE_NAME = 'file_ground_truth.txt'
# Output of np.save with the ground-truth vectors (written only if GET_VEC).
VEC_FILE_NAME = 'vectors.npy'
||
23 | |||
24 | |||
# get the query vectors from the files in NQ_FOLDER_NAME
def load_query_vec(nq, vectors=None, length=0):
    """Load query vectors from every file in NQ_FOLDER_NAME, in name order.

    Args:
        nq: maximum number of vectors to load; 0 means "load everything".
        vectors: optional list to extend in place. A fresh list is created
            when omitted, so results never leak between calls.
        length: number of vectors already counted towards ``nq``.

    Returns:
        The list of loaded vectors (the ``vectors`` argument when given).
    """
    if vectors is None:
        vectors = []
    for filename in sorted(os.listdir(NQ_FOLDER_NAME)):
        vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
        if nq != 0 and length + len(vec_list) >= nq:
            # Take only what is still missing. Counting from the running
            # total (rather than nq % file size) stays correct when the
            # files do not all hold the same number of vectors.
            vectors += vec_list[:max(nq - length, 0)]
            break
        vectors += vec_list
        length += len(vec_list)
    return vectors
||
38 | |||
39 | |||
# load every vector stored in file_name as a plain Python list
def load_vec_list(file_name):
    """Read one vector file and return its rows as nested Python lists.

    The file is parsed as header-less CSV when the module flag CSV is set,
    otherwise with np.load. When UINT8 is set the values are rescaled with
    (x + 0.5) / 255 before conversion.
    """
    if not CSV:
        matrix = np.load(file_name)
    else:
        import pandas as pd
        matrix = np.array(pd.read_csv(file_name, header=None))
    if UINT8:
        matrix = (matrix + 0.5) / 255
    return matrix.tolist()
||
52 | |||
53 | |||
def hex_to_bin(fp):
    """Expand a hexadecimal string into a list of 0/1 ints, 4 bits per digit.

    The result is left-padded with zeros so it always has len(fp) * 4 entries.
    """
    bits = bin(int(fp, 16))[2:].zfill(len(fp) * 4)
    return [int(bit) for bit in bits]
||
62 | |||
63 | |||
def calEuclideanDistance(vec1, vec2):
    """Return the Euclidean (L2) distance between two equally sized vectors."""
    delta = np.array(vec1) - np.array(vec2)
    return np.sqrt(np.square(delta).sum())
||
69 | |||
70 | |||
def calInnerDistance(vec1, vec2):
    """Return the inner product of two equally sized vectors."""
    return np.inner(np.array(vec1), np.array(vec2))
||
76 | |||
77 | |||
def calTanimoto(vec1, vec2):
    """Return the Tanimoto coefficient of two hexadecimal fingerprints.

    Both arguments are hex strings of the same length; they are expanded to
    bit lists with hex_to_bin. The result is
    |A & B| / (|A| + |B| - |A & B|) as a Python float.

    Returns 0.0 when neither fingerprint has a set bit, instead of raising
    ZeroDivisionError on the empty union.
    """
    bits1 = hex_to_bin(vec1)
    bits2 = hex_to_bin(vec2)
    nc = float(np.inner(bits1, bits2))  # bits set in both fingerprints
    n1 = float(np.sum(bits1))
    n2 = float(np.sum(bits2))
    union = n1 + n2 - nc
    if union == 0:
        return 0.0
    # The per-comparison debug print was dropped: this function runs once per
    # base vector and flooded stdout.
    return nc / union
||
88 | |||
89 | |||
def get_ground_truth_l2(topk, idx, vct_nq):
    """Brute-force the ``topk`` nearest base vectors to one query (L2 metric).

    Every file in BASE_FOLDER_NAME is scanned in name order. Each base vector
    is keyed as "8" + 3-digit file index + 6-digit row index. The surviving
    (key, distance) pairs are sorted by ascending distance, printed, and
    written with save_gt_file under query index ``idx``.

    Args:
        topk: number of neighbours to keep; nothing is kept when <= 0.
        idx: query index used to name the result file.
        vct_nq: the query vector.
    """
    no_dist = {}
    if topk > 0:
        for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
            vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
            for j, base_vec in enumerate(vec_list):
                dist = calEuclideanDistance(vct_nq, base_vec)
                num_j = "%01d%03d%06d" % (8, k, j)
                # Fill until topk candidates are held. Testing the candidate
                # count (not "first file and j < topk") also works when the
                # first file contains fewer than topk vectors.
                if len(no_dist) < topk:
                    no_dist[num_j] = dist
                    continue
                # Replace the current worst candidate if this one is closer.
                max_key = max(no_dist, key=no_dist.get)
                if dist < no_dist[max_key]:
                    del no_dist[max_key]
                    no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1])
    print(no_dist)
    save_gt_file(no_dist, idx)
||
113 | |||
114 | |||
def get_ground_truth_ip(topk, idx, vct_nq):
    """Brute-force the ``topk`` best base vectors for one query (inner product).

    Every file in BASE_FOLDER_NAME is scanned in name order. Each base vector
    is keyed as 3-digit file index + 6-digit row index. The surviving
    (key, score) pairs are sorted by descending inner product, printed, and
    written with save_gt_file under query index ``idx``.

    Args:
        topk: number of neighbours to keep; nothing is kept when <= 0.
        idx: query index used to name the result file.
        vct_nq: the query vector.
    """
    no_dist = {}
    if topk > 0:
        for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
            vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
            for j, base_vec in enumerate(vec_list):
                dist = calInnerDistance(vct_nq, base_vec)
                num_j = "%03d%06d" % (k, j)
                # Fill until topk candidates are held. Testing the candidate
                # count (not "first file and j < topk") also works when the
                # first file contains fewer than topk vectors.
                if len(no_dist) < topk:
                    no_dist[num_j] = dist
                    continue
                # Replace the current worst candidate if this one scores higher.
                min_key = min(no_dist, key=no_dist.get)
                if dist > no_dist[min_key]:
                    del no_dist[min_key]
                    no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
||
137 | |||
138 | |||
def get_ground_truth_tanimoto(topk, idx, vec_nq):
    """Brute-force the ``topk`` best base fingerprints for one query (Tanimoto).

    Every file in BASE_FOLDER_NAME is scanned in name order (its path and
    vector count are printed as progress). Each base vector is keyed as
    3-digit file index + 6-digit row index. The surviving (key, score) pairs
    are sorted by descending Tanimoto coefficient, printed, and written with
    save_gt_file under query index ``idx``.

    Args:
        topk: number of neighbours to keep; nothing is kept when <= 0.
        idx: query index used to name the result file.
        vec_nq: the query fingerprint, in the form calTanimoto accepts.
    """
    no_dist = {}
    if topk > 0:
        for k, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
            vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
            print(BASE_FOLDER_NAME + '/' + filename, len(vec_list))
            for j, base_vec in enumerate(vec_list):
                dist = calTanimoto(vec_nq, base_vec)
                num_j = "%03d%06d" % (k, j)
                # Fill until topk candidates are held. Testing the candidate
                # count (not "first file and j < topk") also works when the
                # first file contains fewer than topk vectors.
                if len(no_dist) < topk:
                    no_dist[num_j] = dist
                    continue
                # Replace the current worst candidate if this one scores higher.
                min_key = min(no_dist, key=no_dist.get)
                if dist > no_dist[min_key]:
                    del no_dist[min_key]
                    no_dist[num_j] = dist
    no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
    print(no_dist)
    save_gt_file(no_dist, idx)
||
162 | |||
163 | |||
def save_gt_file(no_dist, idx):
    """Write one query's results to GT_ALL_FOLDER_NAME/<idx as 5 digits>results.txt.

    ``no_dist`` is an iterable of (key, distance) pairs; each becomes one
    "<key> <distance>" line.
    """
    target = GT_ALL_FOLDER_NAME + '/' + ("%05d" % idx + 'results.txt')
    with open(target, 'w') as out:
        out.writelines(
            str(pair[0]) + ' ' + str(pair[1]) + '\n' for pair in no_dist
        )
||
169 | |||
def get_loc_txt(file):
    """Merge all per-query result files into GT_FOLDER_NAME/<file>.

    The files in GT_ALL_FOLDER_NAME are concatenated in name order, with one
    blank line written after each query's block. All file handles are closed
    deterministically via context managers.
    """
    filenames = sorted(os.listdir(GT_ALL_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + file, 'w') as merged:
        for name in filenames:
            with open(GT_ALL_FOLDER_NAME + '/' + name, 'r') as part:
                merged.writelines(part)
            merged.write('\n')  # blank line separates consecutive queries
||
178 | |||
179 | |||
def get_file_loc_txt(gt_file, fnames_file):
    """Translate result keys into "<base file name> <1-based offset>" lines.

    Reads GT_FOLDER_NAME/<gt_file> and writes GT_FOLDER_NAME/<fnames_file>.
    Blank lines (query separators) are copied through unchanged.

    A key ends with a 3-digit file index followed by a 6-digit row index; the
    L2 keys carry one extra leading digit. Slicing from the right decodes both
    the 10-character and the 9-character key layouts correctly.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    src_path = GT_FOLDER_NAME + '/' + gt_file
    dst_path = GT_FOLDER_NAME + '/' + fnames_file
    with open(src_path, 'r') as gt_f, open(dst_path, 'w') as fnames_f:
        for line in gt_f:
            fields = line.split()
            if not fields:
                fnames_f.write(line)
                continue
            key = fields[0]
            loca = int(key[-9:-6])
            offset = int(key[-6:])
            fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
||
193 | |||
194 | |||
def load_gt_file_out():
    """Read GT_FOLDER_NAME/FLOC_FILE_NAME into two parallel lists.

    Returns:
        (base_filename, num): the first and second whitespace-separated
        fields of every non-blank line, both kept as strings.
    """
    names, offsets = [], []
    with open(GT_FOLDER_NAME + '/' + FLOC_FILE_NAME, 'r') as handle:
        for row in handle:
            fields = row.split()
            if not fields:
                continue
            names.append(fields[0])
            offsets.append(fields[1])
    return names, offsets
||
206 | |||
207 | |||
def ground_truth_process(metric, nq_list, topk, num):
    """Compute ground truth for one batch of queries, one process per query.

    Args:
        metric: 'L2', 'IP' or 'Tan'.
        nq_list: the query vectors of this batch.
        topk: number of neighbours to keep per query.
        num: global index of the first query in ``nq_list``.

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
        Exception: any error raised inside a worker is re-raised here instead
            of being silently dropped.

    After the workers finish, the merged files are rebuilt and, when GET_VEC
    is set, the matched base vectors are saved to VEC_FILE_NAME.
    """
    workers = {
        'L2': get_ground_truth_l2,
        'IP': get_ground_truth_ip,
        'Tan': get_ground_truth_tanimoto,
    }
    worker = workers.get(metric)
    if worker is None:
        raise ValueError(
            "unsupported metric %r, expected one of %s" % (metric, sorted(workers))
        )

    if nq_list:  # ProcessPoolExecutor rejects a worker count of 0
        with ProcessPoolExecutor(len(nq_list)) as executor:
            futures = [
                executor.submit(worker, topk, num + i, query)
                for i, query in enumerate(nq_list)
            ]
        # Leaving the with-block waits for all workers; result() then
        # surfaces any exception a worker raised.
        for future in futures:
            future.result()

    get_loc_txt(LOC_FILE_NAME)
    get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
    if GET_VEC:
        vec = []
        base_names, offsets = load_gt_file_out()
        for base_name, offset in zip(base_names, offsets):
            vectors = load_vec_list(BASE_FOLDER_NAME + '/' + base_name)
            vec.append(vectors[int(offset) - 1])  # stored offsets are 1-based
        print("saved len of vec:", len(vec))
        np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
||
230 | |||
231 | |||
def main():
    """Command-line entry point.

    Options:
        -q/--nq <n>        number of query vectors to use (0 or omitted: all)
        -k/--topk <n>      neighbours to keep per query (required to run)
        -m/--metric <name> 'L2', 'IP' or 'Tan' (required to run)
        -l                 create the output folders and compute ground truth
        -h/--help          print usage and exit

    All options are parsed before anything runs, so their order on the
    command line does not matter. Without -l nothing is computed.
    """
    usage = "Usage: test.py [-q <nq>] -k <topk> -m <metric> -l"
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:],
            "hlq:k:m:",
            ["help", "nq=", "topk=", "metric="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    nq = 0
    topk = None
    metric = None
    run = False
    for opt_name, opt_value in opts:
        if opt_name in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt_name in ("-q", "--nq"):
            nq = int(opt_value)
        elif opt_name in ("-k", "--topk"):
            topk = int(opt_value)
        elif opt_name in ("-m", "--metric"):
            metric = opt_value
        elif opt_name == "-l":
            run = True

    if not run:
        return
    if topk is None or metric is None:
        # Previously this surfaced as an unbound-variable error.
        print(usage)
        sys.exit(2)

    try:
        os.mkdir(GT_ALL_FOLDER_NAME)
    except FileExistsError:
        # Stale per-query files would be merged into the new results.
        print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
        sys.exit()
    if not os.path.exists(GT_FOLDER_NAME):
        os.mkdir(GT_FOLDER_NAME)

    print("metric type is", metric)
    time_start = time.time()
    query_vectors = load_query_vec(nq)
    nq = len(query_vectors)
    print("query list:", len(query_vectors))
    # Slicing clamps at the end of the list, so the last (possibly shorter)
    # batch needs no special case.
    for round_no, start in enumerate(range(0, nq, PROCESS_NUM), 1):
        print("start with round:", round_no)
        ground_truth_process(metric, query_vectors[start:start + PROCESS_NUM], topk, start)

    time_end = time.time()
    time_cost = time_end - time_start
    print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
||
278 | |||
279 | |||
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
||
282 |