milvus_ground_truth - Code Metrics - milvus-io/milvus - Measure and Improve Code Quality continuously with Scrutinizer

milvus_ground_truth F
last analyzed 2021-04-15 09:02 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	282
Duplicated Lines	15.96 %

Importance

Changes

Metric	Value
eloc	235
dl	45
loc	282
rs	2.96
c	0
b	0
f	0
wmc	68

15 Functions

Rating	Name	Duplication	Size	Complexity
A	save_gt_file()	0	5	3
A	load_query_vec()	0	12	4
D	main()	0	46	12
A	calInnerDistance()	0	5	1
B	ground_truth_process()	0	22	8
B	get_ground_truth_l2()	23	23	7
A	hex_to_bin()	0	8	2
A	load_gt_file_out()	0	11	4
A	load_vec_list()	0	11	3
A	calTanimoto()	0	10	1
B	get_ground_truth_tanimoto()	0	23	7
A	get_loc_txt()	0	8	3
A	calEuclideanDistance()	0	5	1
A	get_file_loc_txt()	0	13	5
B	get_ground_truth_ip()	22	22	7

How to fix Duplicated Code Complexity

import getopt
import heapq
import math
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor

import numpy as np

# Number of query vectors handled per round: main() slices the query list into
# chunks of this size and ground_truth_process() starts one worker per vector.
PROCESS_NUM = 12
# When True, ground_truth_process() also saves the matched base vectors
# to GT_FOLDER_NAME/VEC_FILE_NAME.
GET_VEC = False
# When True, load_vec_list() reads files with pandas.read_csv instead of np.load.
CSV = False
# When True, load_vec_list() rescales the loaded data as (data + 0.5) / 255.
UINT8 = False

# Folder listed for the base vector files that every query is compared against.
BASE_FOLDER_NAME = '/data/milvus/base'
# Folder listed for the query vector files (see load_query_vec()).
NQ_FOLDER_NAME = '/data/milvus/query'

# Folder that receives one result file per query (see save_gt_file()).
GT_ALL_FOLDER_NAME = 'ground_truth_all'
# Folder that receives the merged outputs named below.
GT_FOLDER_NAME = 'ground_truth'
# Merged "<id> <distance>" listing written by get_loc_txt().
LOC_FILE_NAME = 'ground_truth.txt'
# "<base file name> <1-based row>" listing written by get_file_loc_txt().
FLOC_FILE_NAME = 'file_ground_truth.txt'
# Output of np.save() in ground_truth_process() when GET_VEC is enabled.
VEC_FILE_NAME = 'vectors.npy'


def load_query_vec(nq, vectors=None, length=0):
    """Collect query vectors from the files in NQ_FOLDER_NAME.

    Files are visited in sorted filename order and loaded with
    load_vec_list(). Loading stops as soon as ``nq`` vectors have been
    counted; the last file read is truncated so the total never exceeds
    ``nq``, whatever the individual file sizes are.

    Args:
        nq: maximum number of vectors to collect; 0 means "no limit".
        vectors: optional list that is extended in place. A fresh list is
            created when omitted, so results no longer leak between calls
            through a shared default argument.
        length: number of vectors already counted towards ``nq`` before
            any file is read.

    Returns:
        The list holding the collected vectors.
    """
    if vectors is None:
        vectors = []
    for filename in sorted(os.listdir(NQ_FOLDER_NAME)):
        vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
        if nq != 0 and length + len(vec_list) >= nq:
            # Take only what is still missing. Using the running total
            # (rather than nq % len(vec_list)) stays correct when files
            # have different sizes, and stopping here avoids loading one
            # more file just to discover the limit was already reached.
            vectors += vec_list[:max(nq - length, 0)]
            break
        vectors += vec_list
        length += len(vec_list)
    return vectors


def load_vec_list(file_name):
    """Load one vector file and return its contents as nested Python lists.

    The file is read with pandas.read_csv (no header row) when the
    module-level CSV flag is set, otherwise with np.load. When the UINT8
    flag is set, the loaded values are rescaled as ``(data + 0.5) / 255``
    before conversion.
    """
    if not CSV:
        raw = np.load(file_name)
    else:
        # pandas is imported lazily so it is only required for CSV input.
        import pandas as pd
        raw = np.array(pd.read_csv(file_name, header=None))
    scaled = (raw + 0.5) / 255 if UINT8 else raw
    return scaled.tolist()


def hex_to_bin(fp):
    """Expand a hexadecimal string into a list of 0/1 integers.

    Each character of ``fp`` accounts for four bits, and the binary form is
    left-padded with zeros to ``len(fp) * 4`` digits so leading zero digits
    are preserved.
    """
    width = len(fp) * 4
    bits = format(int(fp, 16), 'b').zfill(width)
    return [int(bit) for bit in bits]


def calEuclideanDistance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Both arguments are converted with np.array; the result is the square
    root of the summed squared element-wise differences.
    """
    delta = np.array(vec1) - np.array(vec2)
    return np.sqrt(np.square(delta).sum())


def calInnerDistance(vec1, vec2):
    """Return the inner product of two vectors (both converted with np.array)."""
    return np.inner(np.array(vec1), np.array(vec2))


def calTanimoto(vec1, vec2):
    """Return the Tanimoto similarity of two hex-encoded bit vectors.

    Both arguments are hexadecimal strings that hex_to_bin() expands into
    0/1 lists. The similarity is ``nc / (n1 + n2 - nc)`` where ``nc`` is
    the number of bits set in both vectors and ``n1``/``n2`` are the bits
    set in each vector.

    Returns:
        A float. When neither vector has any bit set the ratio is 0/0;
        1.0 is returned in that case (two empty bit sets are treated as
        identical) instead of raising ZeroDivisionError.
    """
    bits1 = hex_to_bin(vec1)
    bits2 = hex_to_bin(vec2)
    nc = float(np.inner(bits1, bits2))
    n1 = float(np.sum(bits1))
    n2 = float(np.sum(bits2))
    union = n1 + n2 - nc
    if union == 0:
        # Both fingerprints are all-zero: avoid dividing by zero.
        return 1.0
    # The per-pair debug print was dropped: it ran once for every base
    # vector and flooded the output of each worker process.
    return nc / union


def _iter_base_scores(query_vec, score_fn, id_prefix='', show_files=False):
    """Yield ``(vector_id, score)`` for every vector under BASE_FOLDER_NAME.

    Base files are visited in sorted filename order. ``vector_id`` is
    ``id_prefix`` followed by the 3-digit file index and the 6-digit row
    index inside that file. ``score_fn(query_vec, base_vec)`` supplies the
    score. When ``show_files`` is true, each file path and its vector count
    are printed as the file is loaded.
    """
    for file_no, filename in enumerate(sorted(os.listdir(BASE_FOLDER_NAME))):
        path = BASE_FOLDER_NAME + '/' + filename
        vec_list = load_vec_list(path)
        if show_files:
            print(path, len(vec_list))
        for row_no, base_vec in enumerate(vec_list):
            vector_id = id_prefix + "%03d%06d" % (file_no, row_no)
            yield vector_id, score_fn(query_vec, base_vec)


def _save_top_k(topk, idx, scored, largest_first):
    """Keep the ``topk`` best ``(id, score)`` pairs, print and save them.

    heapq.nsmallest / heapq.nlargest return the selection already ordered
    best-first, so the result no longer depends on the first base file
    holding at least ``topk`` vectors, and ``topk == 0`` or an empty base
    folder yields an empty result instead of a crash.
    """
    select = heapq.nlargest if largest_first else heapq.nsmallest
    ranked = select(topk, scored, key=lambda pair: pair[1])
    print(ranked)
    save_gt_file(ranked, idx)


def get_ground_truth_l2(topk, idx, vct_nq):
    """Save the ``topk`` base vectors closest to ``vct_nq`` by Euclidean distance.

    Results are ordered by ascending distance and written through
    save_gt_file() under query index ``idx``. Ids carry a leading ``8``
    before the file/row digits.
    """
    scored = _iter_base_scores(vct_nq, calEuclideanDistance, id_prefix='8')
    _save_top_k(topk, idx, scored, largest_first=False)


def get_ground_truth_ip(topk, idx, vct_nq):
    """Save the ``topk`` base vectors with the largest inner product with ``vct_nq``.

    Results are ordered by descending inner product and written through
    save_gt_file() under query index ``idx``.
    """
    scored = _iter_base_scores(vct_nq, calInnerDistance)
    _save_top_k(topk, idx, scored, largest_first=True)


def get_ground_truth_tanimoto(topk, idx, vec_nq):
    """Save the ``topk`` base vectors with the highest Tanimoto similarity to ``vec_nq``.

    Results are ordered by descending similarity and written through
    save_gt_file() under query index ``idx``. Each base file path and its
    vector count are printed while scanning.
    """
    scored = _iter_base_scores(vec_nq, calTanimoto, show_files=True)
    _save_top_k(topk, idx, scored, largest_first=True)


def save_gt_file(no_dist, idx):
    """Write one query's ranked results into GT_ALL_FOLDER_NAME.

    The file is named after ``idx`` zero-padded to five digits followed by
    ``results.txt``. Each entry of ``no_dist`` becomes one line holding its
    first two items separated by a single space.
    """
    target = GT_ALL_FOLDER_NAME + '/' + ("%05d" % idx + 'results.txt')
    with open(target, 'w') as out:
        out.writelines(
            ' '.join((str(entry[0]), str(entry[1]))) + '\n'
            for entry in no_dist
        )

def get_loc_txt(file):
    """Merge every per-query result file into ``GT_FOLDER_NAME/<file>``.

    The files in GT_ALL_FOLDER_NAME are appended in sorted filename order,
    each followed by one empty line that separates the queries.

    Every file is opened in a ``with`` block, so the merged output is
    flushed and closed before the function returns and no read handle is
    left open.
    """
    filenames = sorted(os.listdir(GT_ALL_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + file, 'w') as merged:
        for name in filenames:
            with open(GT_ALL_FOLDER_NAME + '/' + name, 'r') as part:
                merged.writelines(part)
            merged.write('\n')


def get_file_loc_txt(gt_file, fnames_file):
    """Translate result ids into "<base file name> <1-based row>" lines.

    Reads ``GT_FOLDER_NAME/<gt_file>`` and writes
    ``GT_FOLDER_NAME/<fnames_file>``. The first field of each non-empty
    line is an id whose last six digits are the row index and whose three
    digits before those are the index of the base file in the sorted
    listing of BASE_FOLDER_NAME. Empty separator lines are kept.

    The id is decoded from its end because the L2 ids carry one extra
    leading digit while the IP and Tanimoto ids do not; fixed positions
    from the start only fit the L2 form.
    """
    filenames = sorted(os.listdir(BASE_FOLDER_NAME))
    with open(GT_FOLDER_NAME + '/' + gt_file, 'r') as gt_f:
        with open(GT_FOLDER_NAME + '/' + fnames_file, 'w') as fnames_f:
            for line in gt_f:
                fields = line.split()
                if not fields:
                    # Blank line separating two queries.
                    fnames_f.write('\n')
                    continue
                vector_id = fields[0]
                loca = int(vector_id[-9:-6])
                offset = int(vector_id[-6:])
                fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')


def load_gt_file_out():
    """Read GT_FOLDER_NAME/FLOC_FILE_NAME back into two parallel lists.

    Returns:
        ``(base_filename, num)``: the first and second whitespace-separated
        field of every non-blank line, both kept as strings.
    """
    base_filename, num = [], []
    with open(GT_FOLDER_NAME + '/' + FLOC_FILE_NAME, 'r') as handle:
        for fields in map(str.split, handle):
            if not fields:
                continue
            base_filename.append(fields[0])
            num.append(fields[1])
    return base_filename, num


def ground_truth_process(metric, nq_list, topk, num):
    """Compute the ground truth for one batch of queries, then merge results.

    One worker process is started per query vector. After the batch is
    done, the per-query files are merged with get_loc_txt() and translated
    with get_file_loc_txt(). When GET_VEC is set, the matched base vectors
    are additionally saved to GT_FOLDER_NAME/VEC_FILE_NAME.

    Args:
        metric: 'L2', 'IP' or 'Tan'.
        nq_list: query vectors of this batch.
        topk: number of results to keep per query.
        num: global index of the first query of the batch; query ``i`` of
            the batch is saved under index ``num + i``.

    Raises:
        ValueError: if ``metric`` is not one of the supported names
            (previously nothing was computed and no error was reported).
        Exception: whatever a worker raised; failures are re-raised here
            instead of being silently dropped with the unchecked futures.
    """
    workers = {
        'L2': get_ground_truth_l2,
        'IP': get_ground_truth_ip,
        'Tan': get_ground_truth_tanimoto,
    }
    if metric not in workers:
        raise ValueError(
            "unsupported metric %r, expected one of 'L2', 'IP', 'Tan'" % (metric,)
        )
    worker = workers[metric]

    thread_num = len(nq_list)
    if thread_num:  # ProcessPoolExecutor rejects a worker count of 0
        with ProcessPoolExecutor(thread_num) as executor:
            futures = [
                executor.submit(worker, topk, num + i, query_vec)
                for i, query_vec in enumerate(nq_list)
            ]
            for future in futures:
                # result() re-raises any exception from the worker process.
                future.result()

    get_loc_txt(LOC_FILE_NAME)
    get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
    if GET_VEC:
        vec = []
        base_files, rows = load_gt_file_out()
        for base_file, row in zip(base_files, rows):
            vectors = load_vec_list(BASE_FOLDER_NAME + '/' + base_file)
            # Rows in the listing are 1-based.
            vec.append(vectors[int(row) - 1])
        print("saved len of vec:", len(vec))
        np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)


def main():
    """Command-line entry point: ``test.py [-q <nq>] -k <topk> -m <metric> -l``.

    Options:
        -q / --nq      number of query vectors to use (0 or omitted: all).
        -k / --topk    number of results to keep per query.
        -m / --metric  metric name passed to ground_truth_process().
        -l             run the ground-truth computation.
        -h / --help    print the usage line and exit.

    All options are parsed before any work starts, so their order on the
    command line no longer matters, and a missing -k or -m is reported as a
    usage error instead of failing on an unbound variable. Nothing is
    computed when -l is absent.
    """
    usage = "Usage: test.py [-q <nq>] -k <topk> -m <metric> -l"
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:],
            "hlq:k:m:",
            ["help", "nq=", "topk=", "metric="],
        )
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    nq = 0
    topk = None
    metric = None
    run_requested = False
    for opt_name, opt_value in opts:
        if opt_name in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt_name in ("-q", "--nq"):
            nq = int(opt_value)
        elif opt_name in ("-k", "--topk"):
            topk = int(opt_value)
        elif opt_name in ("-m", "--metric"):
            metric = opt_value
        elif opt_name == "-l":
            run_requested = True

    if not run_requested:
        return
    if topk is None or metric is None:
        print(usage)
        sys.exit(2)

    try:
        os.mkdir(GT_ALL_FOLDER_NAME)
    except FileExistsError:
        # Stale per-query files would be merged into the new results, so
        # refuse to reuse the folder. Other OS errors are left to propagate
        # rather than being reported as "already exists".
        print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
        sys.exit(1)
    os.makedirs(GT_FOLDER_NAME, exist_ok=True)

    print("metric type is", metric)

    time_start = time.time()
    # Pass a fresh list explicitly so repeated calls never share state.
    query_vectors = load_query_vec(nq, [])
    nq = len(query_vectors)
    print("query list:", len(query_vectors))
    # One round per PROCESS_NUM queries; slicing past the end simply yields
    # the shorter final batch.
    for round_no, start in enumerate(range(0, nq, PROCESS_NUM), 1):
        print("start with round:", round_no)
        ground_truth_process(
            metric, query_vectors[start:start + PROCESS_NUM], topk, start
        )

    time_cost = time.time() - time_start
    print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()


1		import getopt
2		import os
3		import sys
4		import time
5		from concurrent.futures import ThreadPoolExecutor
6		from concurrent.futures import ProcessPoolExecutor
7		import numpy as np
8		import math
9
10		PROCESS_NUM = 12
11		GET_VEC = False
12		CSV = False
13		UINT8 = False
14
15		BASE_FOLDER_NAME = '/data/milvus/base'
16		NQ_FOLDER_NAME = '/data/milvus/query'
17
18		GT_ALL_FOLDER_NAME = 'ground_truth_all'
19		GT_FOLDER_NAME = 'ground_truth'
20		LOC_FILE_NAME = 'ground_truth.txt'
21		FLOC_FILE_NAME = 'file_ground_truth.txt'
22		VEC_FILE_NAME = 'vectors.npy'
23
24
25		# get vectors of the files
26		def load_query_vec(nq, vectors=[], length=0):
27		filenames = os.listdir(NQ_FOLDER_NAME)
28		filenames.sort()
29		for filename in filenames:
30		vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
31		length += len(vec_list)
32		if nq!=0 and length>nq :
33		num = nq % len(vec_list)
34		vectors += vec_list[0:num]
35		break
36		vectors += vec_list
37		return vectors
38
39
40		# load vectors from filr_name and num means nq's number
41		def load_vec_list(file_name):
42		if CSV:
43		import pandas as pd
44		data = pd.read_csv(file_name, header=None)
45		data = np.array(data)
46		else:
47		data = np.load(file_name)
48		if UINT8:
49		data = (data + 0.5) / 255
50		vec_list = data.tolist()
51		return vec_list
52
53
54		def hex_to_bin(fp):
55		vec=[]
56		length = len(fp) * 4
57		bstr = str(bin(int(fp,16)))
58		bstr = (length-(len(bstr)-2)) * '0' + bstr[2:]
59		for f in bstr:
60		vec.append(int(f))
61		return vec
62
63
64		def calEuclideanDistance(vec1, vec2):
65		vec1 = np.array(vec1)
66		vec2 = np.array(vec2)
67		dist = np.sqrt(np.sum(np.square(vec1 - vec2)))
68		return dist
69
70
71		def calInnerDistance(vec1, vec2):
72		vec1 = np.array(vec1)
73		vec2 = np.array(vec2)
74		dist = np.inner(vec1, vec2)
75		return dist
76
77
78		def calTanimoto(vec1, vec2):
79		vec1 = hex_to_bin(vec1)
80		vec2 = hex_to_bin(vec2)
81		# print(vec1,vec2)
82		nc = float(np.inner(vec1, vec2))
83		n1 = float(np.sum(vec1))
84		n2 = float(np.sum(vec2))
85		dist = nc/(n1+n2-nc)
86		print(nc,n1,n2)
87		return dist
88
89
90	View Code Duplication	def get_ground_truth_l2(topk, idx, vct_nq):
		0 ignored issues – show Duplication introduced 2020-02-23 17:22 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
91		filenames = os.listdir(BASE_FOLDER_NAME)
92		filenames.sort()
93		no_dist = {}
94		k = 0
95		for filename in filenames:
96		vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
97		for j in range(len(vec_list)):
98		dist = calEuclideanDistance(vct_nq, vec_list[j])
99		num_j = "%01d%03d%06d" % (8, k, j)
100		if k==0 and j<topk :
101		no_dist[num_j] = dist
102		else:
103		# sorted by values
104		max_key = max(no_dist, key=no_dist.get)
105		max_value = no_dist[max_key]
106		if dist < max_value:
107		m = no_dist.pop(max_key)
108		no_dist[num_j] = dist
109		k += 1
110		no_dist = sorted(no_dist.items(), key=lambda x: x[1])
111		print(no_dist)
112		save_gt_file(no_dist, idx)
113
114
115	View Code Duplication	def get_ground_truth_ip(topk, idx, vct_nq):
		0 ignored issues – show Duplication introduced 2020-02-23 17:22 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
116		filenames = os.listdir(BASE_FOLDER_NAME) # get the whole file names
117		filenames.sort()
118		no_dist = {}
119		k = 0
120		for filename in filenames:
121		vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
122		for j in range(len(vec_list)):
123		dist = calInnerDistance(vct_nq, vec_list[j])
124		num_j = "%03d%06d" % (k, j)
125		if k==0 and j<topk :
126		no_dist[num_j] = dist
127		else:
128		min_key = min(no_dist, key=no_dist.get)
129		min_value = no_dist[min_key]
130		if dist > min_value:
131		m = no_dist.pop(min_key)
132		no_dist[num_j] = dist
133		k += 1
134		no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
135		print(no_dist)
136		save_gt_file(no_dist, idx)
137
138
139		def get_ground_truth_tanimoto(topk, idx, vec_nq):
140		filenames = os.listdir(BASE_FOLDER_NAME) # get the whole file names
141		filenames.sort()
142		no_dist = {}
143		k = 0
144		for filename in filenames:
145		vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
146		print(BASE_FOLDER_NAME + '/' + filename, len(vec_list))
147		for j in range(len(vec_list)):
148		dist = calTanimoto(vec_nq, vec_list[j])
149		num_j = "%03d%06d" % (k, j)
150		if k==0 and j<topk :
151		no_dist[num_j] = dist
152		else:
153		min_key = min(no_dist, key=no_dist.get)
154		min_value = no_dist[min_key]
155		if dist > min_value:
156		m = no_dist.pop(min_key)
157		no_dist[num_j] = dist
158		k += 1
159		no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
160		print(no_dist)
161		save_gt_file(no_dist, idx)
162
163
164		def save_gt_file(no_dist, idx):
165		filename = "%05d" % idx + 'results.txt'
166		with open(GT_ALL_FOLDER_NAME+'/'+filename, 'w') as f:
167		for re in no_dist:
168		f.write(str(re[0]) + ' ' + str(re[1]) + '\n')
169
170		def get_loc_txt(file):
171		filenames = os.listdir(GT_ALL_FOLDER_NAME)
172		filenames.sort()
173		write_file = open(GT_FOLDER_NAME + '/' + file, 'w+')
174		for f in filenames:
175		for line in open(GT_ALL_FOLDER_NAME+'/'+f, 'r'):
176		write_file.write(line)
177		write_file.write('\n')
178
179
180		def get_file_loc_txt(gt_file, fnames_file):
181		filenames = os.listdir(BASE_FOLDER_NAME)
182		filenames.sort()
183		with open(GT_FOLDER_NAME+'/'+gt_file, 'r') as gt_f:
184		with open(GT_FOLDER_NAME+'/'+fnames_file, 'w') as fnames_f:
185		for line in gt_f:
186		if line != '\n':
187		line = line.split()[0]
188		loca = int(line[1:4])
189		offset = int(line[4:10])
190		fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
191		else:
192		fnames_f.write(line)
193
194
195		def load_gt_file_out():
196		file_name = GT_FOLDER_NAME + '/' + FLOC_FILE_NAME
197		base_filename = []
198		num = []
199		with open(file_name, 'r') as f:
200		for line in f.readlines():
201		data = line.split()
202		if data:
203		base_filename.append(data[0])
204		num.append(data[1])
205		return base_filename, num
206
207
208		def ground_truth_process(metric,nq_list, topk, num):
209		thread_num = len(nq_list)
210		with ProcessPoolExecutor(thread_num) as executor:
211		for i in range(thread_num):
212		# print("Process:",num+i)
213		if metric == 'L2':
214		executor.submit(get_ground_truth_l2, topk, num+i, nq_list[i])
215		elif metric == 'IP':
216		executor.submit(get_ground_truth_ip, topk, num+i, nq_list[i])
217		elif metric == 'Tan':
218		executor.submit(get_ground_truth_tanimoto, topk, num+i, nq_list[i])
219		get_loc_txt(LOC_FILE_NAME)
220		get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
221		if GET_VEC:
222		vec = []
223		file, num = load_gt_file_out()
224		for i in range(len(file)):
225		n = int(num[i]) - 1
226		vectors = load_vec_list(BASE_FOLDER_NAME + '/' + file[i])
227		vec.append(vectors[n])
228		print("saved len of vec:", len(vec))
229		np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
230
231
232		def main():
233		try:
234		opts, args = getopt.getopt(
235		sys.argv[1:],
236		"hlq:k:m:",
237		["help", "nq=", "topk=", "metric="],
238		)
239		except getopt.GetoptError:
240		print("Usage: test.py [-q <nq>] -k <topk> -s")
241		sys.exit(2)
242		nq = 0
243		for opt_name, opt_value in opts:
244		if opt_name in ("-h", "--help"):
245		print("test.py [-q <nq>] -k <topk> -l")
246		sys.exit()
247		elif opt_name in ("-q", "--nq"):
248		nq = int(opt_value)
249		elif opt_name in ("-k", "--topk"):
250		topk = int(opt_value)
251		elif opt_name in ("-m", "--metric"):
252		metric = opt_value
253		elif opt_name == "-l": # test.py [-q <nq>] -k <topk> -m -l
254		try:
255		os.mkdir(GT_ALL_FOLDER_NAME)
256		except:
257		print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
258		sys.exit()
259		if not os.path.exists(GT_FOLDER_NAME):
260		os.mkdir(GT_FOLDER_NAME)
261
262		print("metric type is",metric)
		0 ignored issues – show introduced 2020-02-23 17:22 UTC by Report Bug Copy Issue Report The variable `metric` does not seem to be defined for all execution paths. Loading history...
263		time_start = time.time()
264		query_vectors = load_query_vec(nq)
265		nq = len(query_vectors)
266		print("query list:", len(query_vectors))
267		num = math.ceil(nq/PROCESS_NUM)
268		for i in range(num):
269		print("start with round:",i+1)
270		if i==num-1:
271		ground_truth_process(metric, query_vectors[i*PROCESS_NUM:nq], topk, i*PROCESS_NUM)
		0 ignored issues – show introduced 2020-02-23 17:22 UTC by Report Bug Copy Issue Report The variable `topk` does not seem to be defined for all execution paths. Loading history...
272		else:
273		ground_truth_process(metric, query_vectors[i*PROCESS_NUM:i*PROCESS_NUM+PROCESS_NUM], topk, i*PROCESS_NUM)
274
275		time_end = time.time()
276		time_cost = time_end - time_start
277		print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
278
279
280		if __name__ == '__main__':
281		main()
282

milvus-io / milvus

milvus_ground_truth F last analyzed 2021-04-15 09:02 UTC

Complexity

Size/Duplication

Importance

15 Functions

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like

milvus_ground_truth F
last analyzed 2021-04-15 09:02 UTC