| Total Complexity | 68 |
| Total Lines | 282 |
| Duplicated Lines | 15.96 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like milvus_ground_truth often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | import getopt |
||
| 2 | import os |
||
| 3 | import sys |
||
| 4 | import time |
||
| 5 | from concurrent.futures import ThreadPoolExecutor |
||
| 6 | from concurrent.futures import ProcessPoolExecutor |
||
| 7 | import numpy as np |
||
| 8 | import math |
||
| 9 | |||
# --- run-time switches -------------------------------------------------------
# Number of query vectors handed to one round of worker processes.
PROCESS_NUM = 12
# When true, the matched base vectors are additionally saved to VEC_FILE_NAME.
GET_VEC = False
# When true, vector files are parsed as header-less CSV instead of .npy.
CSV = False
# When true, loaded values are rescaled with (x + 0.5) / 255.
UINT8 = False

# --- input folders -----------------------------------------------------------
BASE_FOLDER_NAME = '/data/milvus/base'
NQ_FOLDER_NAME = '/data/milvus/query'

# --- output folders and files ------------------------------------------------
# Per-query result files are written here.
GT_ALL_FOLDER_NAME = 'ground_truth_all'
# Merged results and derived files are written here.
GT_FOLDER_NAME = 'ground_truth'
LOC_FILE_NAME = 'ground_truth.txt'
FLOC_FILE_NAME = 'file_ground_truth.txt'
VEC_FILE_NAME = 'vectors.npy'
||
| 23 | |||
| 24 | |||
| 25 | # get vectors of the files |
||
def load_query_vec(nq, vectors=None, length=0):
    """Load query vectors from the files in NQ_FOLDER_NAME, in sorted name order.

    Args:
        nq: Maximum number of vectors to collect; ``0`` means "load everything".
        vectors: Optional list to extend in place. A new list is created when
            omitted (a mutable ``[]`` default would be shared between calls).
        length: Number of vectors already counted towards ``nq``.

    Returns:
        The list of collected vectors (the same object as ``vectors`` when one
        was passed in).
    """
    if vectors is None:
        vectors = []
    for filename in sorted(os.listdir(NQ_FOLDER_NAME)):
        # Stop before touching another file once the quota is already met.
        if nq != 0 and length >= nq:
            break
        vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
        if nq != 0 and length + len(vec_list) > nq:
            # Take only what is still missing. ``nq % len(vec_list)`` gives
            # the wrong count as soon as the files differ in size.
            vectors += vec_list[:nq - length]
            break
        vectors += vec_list
        length += len(vec_list)
    return vectors
||
| 38 | |||
| 39 | |||
| 40 | # load all vectors stored in file_name and return them as a list |
||
def load_vec_list(file_name):
    """Read one vector file and return its content as nested Python lists.

    The file is parsed as header-less CSV when the module flag ``CSV`` is set,
    otherwise it is read with ``np.load``. When ``UINT8`` is set the values are
    rescaled with ``(x + 0.5) / 255`` before conversion.
    """
    if not CSV:
        data = np.load(file_name)
    else:
        # pandas is only needed for CSV input, so it is imported lazily.
        import pandas as pd
        frame = pd.read_csv(file_name, header=None)
        data = np.array(frame)
    if UINT8:
        data = (data + 0.5) / 255
    return data.tolist()
||
| 52 | |||
| 53 | |||
def hex_to_bin(fp):
    """Expand a hexadecimal string into a list of 0/1 integers.

    Every character of ``fp`` contributes four bits, so leading zero digits
    are preserved as leading zero bits in the result.
    """
    width = len(fp) * 4
    digits = format(int(fp, 16), 'b')
    return [int(bit) for bit in digits.zfill(width)]
||
| 62 | |||
| 63 | |||
def calEuclideanDistance(vec1, vec2):
    """Return the Euclidean (L2) distance between two equally sized vectors."""
    delta = np.array(vec1) - np.array(vec2)
    return np.sqrt(np.square(delta).sum())
||
| 69 | |||
| 70 | |||
def calInnerDistance(vec1, vec2):
    """Return the inner product of two equally sized vectors."""
    lhs, rhs = np.array(vec1), np.array(vec2)
    return np.inner(lhs, rhs)
||
| 76 | |||
| 77 | |||
def calTanimoto(vec1, vec2):
    """Return the Tanimoto coefficient of two hexadecimal fingerprints.

    The coefficient is ``|A & B| / (|A| + |B| - |A & B|)``, where ``|X|`` is
    the number of set bits. The bits are counted directly on the integer
    values, so no per-bit list is built and fingerprints of different string
    length are accepted (missing leading digits count as zero bits).

    Args:
        vec1: Fingerprint as a hexadecimal string.
        vec2: Fingerprint as a hexadecimal string.

    Returns:
        The coefficient as a float in ``[0.0, 1.0]``. Two all-zero
        fingerprints have an empty union; ``0.0`` is returned for them instead
        of raising ``ZeroDivisionError``.

    Raises:
        ValueError: If either argument is not a valid hexadecimal string.
    """
    bits1 = int(vec1, 16)
    bits2 = int(vec2, 16)
    nc = float(bin(bits1 & bits2).count('1'))
    n1 = float(bin(bits1).count('1'))
    n2 = float(bin(bits2).count('1'))
    union = n1 + n2 - nc
    if union == 0:
        # Neither fingerprint has a set bit: the ratio is undefined.
        return 0.0
    return nc / union
||
| 88 | |||
| 89 | |||
def _collect_ground_truth(topk, idx, query, distance, make_id, largest,
                          verbose=False):
    """Score every base vector against ``query`` and save the best ``topk``.

    The files of BASE_FOLDER_NAME are visited in sorted name order and their
    vectors are streamed through ``heapq``, so only ``topk`` candidates are
    held at a time. Fewer than ``topk`` results are produced only when the
    base set itself holds fewer vectors. Among equal scores the vector that
    was encountered first is kept and listed first.

    Args:
        topk: Number of results to keep.
        idx: Query index, forwarded to ``save_gt_file``.
        query: The query vector.
        distance: Callable ``(query, vector) -> score``.
        make_id: Callable ``(file_number, row_number) -> id string``.
        largest: Keep the highest scores when true, the lowest otherwise.
        verbose: When true, print each file path with its vector count.
    """
    import heapq  # local import keeps this block self-contained

    def scored_candidates():
        filenames = sorted(os.listdir(BASE_FOLDER_NAME))
        for file_no, filename in enumerate(filenames):
            path = BASE_FOLDER_NAME + '/' + filename
            vec_list = load_vec_list(path)
            if verbose:
                print(path, len(vec_list))
            for row, vec in enumerate(vec_list):
                yield make_id(file_no, row), distance(query, vec)

    select = heapq.nlargest if largest else heapq.nsmallest
    # Result is a list of (id, score) pairs, already ordered best-first.
    ranked = select(topk, scored_candidates(), key=lambda item: item[1])
    print(ranked)
    save_gt_file(ranked, idx)


def get_ground_truth_l2(topk, idx, vct_nq):
    """Save the ``topk`` base vectors closest to ``vct_nq`` by Euclidean distance.

    Ids are written as ``8`` + 3-digit file number + 6-digit row number, and
    results are ordered by ascending distance.
    """
    _collect_ground_truth(
        topk, idx, vct_nq, calEuclideanDistance,
        lambda file_no, row: "%01d%03d%06d" % (8, file_no, row),
        largest=False,
    )


def get_ground_truth_ip(topk, idx, vct_nq):
    """Save the ``topk`` base vectors with the largest inner product to ``vct_nq``.

    Ids are written as 3-digit file number + 6-digit row number, and results
    are ordered by descending inner product.
    """
    _collect_ground_truth(
        topk, idx, vct_nq, calInnerDistance,
        lambda file_no, row: "%03d%06d" % (file_no, row),
        largest=True,
    )


def get_ground_truth_tanimoto(topk, idx, vec_nq):
    """Save the ``topk`` base fingerprints with the highest Tanimoto score.

    Ids are written as 3-digit file number + 6-digit row number, and results
    are ordered by descending score. Each base file path is printed together
    with its vector count while it is processed.
    """
    _collect_ground_truth(
        topk, idx, vec_nq, calTanimoto,
        lambda file_no, row: "%03d%06d" % (file_no, row),
        largest=True,
        verbose=True,
    )
||
| 162 | |||
| 163 | |||
def save_gt_file(no_dist, idx):
    """Write one query's results to ``<idx, 5 digits>results.txt``.

    The file is created inside GT_ALL_FOLDER_NAME. Each entry of ``no_dist``
    becomes one line made of its first two items separated by a space.
    """
    target = GT_ALL_FOLDER_NAME + '/' + ("%05d" % idx + 'results.txt')
    with open(target, 'w') as out:
        out.writelines(
            str(entry[0]) + ' ' + str(entry[1]) + '\n' for entry in no_dist
        )
||
| 169 | |||
def get_loc_txt(file):
    """Merge all per-query result files into ``GT_FOLDER_NAME/<file>``.

    The files of GT_ALL_FOLDER_NAME are appended in sorted name order, each
    followed by one blank line as a separator.

    Args:
        file: Name of the merged output file inside GT_FOLDER_NAME.
    """
    # List the inputs first so that a missing folder does not leave an empty
    # output file behind.
    filenames = sorted(os.listdir(GT_ALL_FOLDER_NAME))
    # Context managers guarantee that the output is flushed and every input is
    # closed before the merged file is read again by a later step.
    with open(GT_FOLDER_NAME + '/' + file, 'w') as merged:
        for name in filenames:
            with open(GT_ALL_FOLDER_NAME + '/' + name, 'r') as part:
                merged.writelines(part)
            merged.write('\n')
||
| 178 | |||
| 179 | |||
def get_file_loc_txt(gt_file, fnames_file):
    """Translate result ids into ``<base file name> <1-based row>`` lines.

    Reads ``GT_FOLDER_NAME/<gt_file>`` and writes ``GT_FOLDER_NAME/<fnames_file>``.
    The first token of each non-blank line is an id whose last nine characters
    are a 3-digit file number followed by a 6-digit row number. Reading the id
    from its right-hand end handles both the 10-character form with a leading
    marker digit and the bare 9-character form. Blank lines are copied through
    unchanged.

    Args:
        gt_file: Name of the merged ground-truth file to read.
        fnames_file: Name of the translated file to write.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + gt_file, 'r') as gt_f, \
            open(GT_FOLDER_NAME + '/' + fnames_file, 'w') as fnames_f:
        for line in gt_f:
            fields = line.split()
            if not fields:
                # Separator between two queries (or a whitespace-only line).
                fnames_f.write(line)
                continue
            vec_id = fields[0]
            loca = int(vec_id[-9:-6])
            offset = int(vec_id[-6:])
            fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
||
| 193 | |||
| 194 | |||
def load_gt_file_out():
    """Read ``GT_FOLDER_NAME/FLOC_FILE_NAME`` into two parallel lists.

    Returns:
        ``(names, positions)``: the first and the second whitespace-separated
        token of every non-blank line, both kept as strings.
    """
    names, positions = [], []
    with open(GT_FOLDER_NAME + '/' + FLOC_FILE_NAME, 'r') as handle:
        for raw in handle:
            fields = raw.split()
            if not fields:
                continue
            names.append(fields[0])
            positions.append(fields[1])
    return names, positions
||
| 206 | |||
| 207 | |||
def ground_truth_process(metric, nq_list, topk, num):
    """Compute the ground truth for one batch of queries, one process each.

    After the batch has finished, the per-query files are merged and
    translated into file/row form; when ``GET_VEC`` is set the matched base
    vectors are saved as well.

    Args:
        metric: ``'L2'``, ``'IP'`` or ``'Tan'``.
        nq_list: The query vectors of this batch.
        topk: Number of neighbours to keep per query.
        num: Global index of the first query in ``nq_list``.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
        Exception: Any exception raised inside a worker process is re-raised
            here instead of being silently dropped.
    """
    workers = {
        'L2': get_ground_truth_l2,
        'IP': get_ground_truth_ip,
        'Tan': get_ground_truth_tanimoto,
    }
    if metric not in workers:
        raise ValueError("unsupported metric %r, expected one of %s"
                         % (metric, sorted(workers)))
    worker = workers[metric]

    thread_num = len(nq_list)
    if thread_num:  # ProcessPoolExecutor rejects a pool size of 0
        with ProcessPoolExecutor(thread_num) as executor:
            futures = [
                executor.submit(worker, topk, num + i, query)
                for i, query in enumerate(nq_list)
            ]
        # Leaving the ``with`` block waits for all workers; result() then
        # surfaces the first failure, which submit() alone would hide.
        for future in futures:
            future.result()

    get_loc_txt(LOC_FILE_NAME)
    get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
    if GET_VEC:
        vec = []
        file_names, positions = load_gt_file_out()
        for file_name, position in zip(file_names, positions):
            vectors = load_vec_list(BASE_FOLDER_NAME + '/' + file_name)
            # Positions in the file are 1-based.
            vec.append(vectors[int(position) - 1])
        print("saved len of vec:", len(vec))
        np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
||
| 230 | |||
| 231 | |||
def main():
    """Parse the command line and generate the ground-truth files.

    Options:
        -h, --help      print the usage line and exit
        -q, --nq N      number of query vectors to use (0 or omitted: all)
        -k, --topk K    number of neighbours per query (required)
        -m, --metric M  one of L2, IP, Tan (required)
        -l              run the generation

    All options are read before anything is executed, so their order on the
    command line does not matter.

    NOTE(review): the listing this function was recovered from had lost its
    indentation; the run is assumed to be triggered by ``-l`` - confirm.
    """
    usage = "test.py [-q <nq>] -k <topk> -m <metric> -l"
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:],
            "hlq:k:m:",
            ["help", "nq=", "topk=", "metric="],
        )
    except getopt.GetoptError:
        print("Usage: " + usage)
        sys.exit(2)

    nq = 0
    topk = None
    metric = None
    run = False
    for opt_name, opt_value in opts:
        if opt_name in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt_name in ("-q", "--nq"):
            nq = int(opt_value)
        elif opt_name in ("-k", "--topk"):
            topk = int(opt_value)
        elif opt_name in ("-m", "--metric"):
            metric = opt_value
        elif opt_name == "-l":
            run = True

    if not run:
        print("Usage: " + usage)
        return
    # Validate before creating any folder, so a rejected call leaves nothing
    # behind that would have to be deleted before the next attempt.
    if topk is None or metric not in ("L2", "IP", "Tan"):
        print("Usage: " + usage)
        sys.exit(2)

    try:
        os.mkdir(GT_ALL_FOLDER_NAME)
    except FileExistsError:
        print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
        sys.exit()
    if not os.path.exists(GT_FOLDER_NAME):
        os.mkdir(GT_FOLDER_NAME)

    print("metric type is", metric)
    time_start = time.time()
    query_vectors = load_query_vec(nq)
    nq = len(query_vectors)
    print("query list:", len(query_vectors))
    # Slicing clamps at the end of the list, so the last (possibly shorter)
    # batch needs no special case.
    for round_no, start in enumerate(range(0, nq, PROCESS_NUM), 1):
        print("start with round:", round_no)
        ground_truth_process(
            metric, query_vectors[start:start + PROCESS_NUM], topk, start)

    time_cost = time.time() - time_start
    print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
||
| 278 | |||
| 279 | |||
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
||
| 282 |