training_and_test() - Code Metrics - Inspection of "example: Add scripts that trys to find best hyperp..." - TinghuiWang/pyActLearn - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 2384cb...f17ea4 )

by Tinghui

created 2017-02-08 05:30 UTC

training_and_test() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	13
Bugs	0	Features	0

Metric	Value
cc	3
c	13
b	0
f	0
dl	0
loc	26
rs	8.8571

import argparse
import logging
import os
import pickle
from datetime import datetime

import numpy as np
from sklearn.ensemble import RandomForestClassifier

from pyActLearn.CASAS.data import CASASData
from pyActLearn.CASAS.fuel import CASASFuel
from pyActLearn.performance import get_confusion_matrix
from pyActLearn.performance.record import LearningResult

logger = logging.getLogger(__file__)


def training_and_test(token, train_data, test_data, num_classes, result):
    """Train a random forest on ``train_data`` and evaluate it on ``test_data``.

    The confusion matrix of the test run, together with the hyper-parameters
    of the model, is stored in ``result`` under the key ``token``.

    Args:
        token (:obj:`str`): token representing this run
        train_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of training feature and label
        test_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of testing feature and label
        num_classes (:obj:`int`): Number of classes
        result (:obj:`pyActLearn.performance.record.LearningResult`): LearningResult object to hold learning result

    Returns:
        :obj:`tuple`: ``(predicted_y, predicted_proba)``. When the fitted model
        reports a different number of classes than ``num_classes``,
        ``predicted_proba`` is expanded to ``num_classes`` columns and the
        columns of classes missing from the training labels are left at zero.
    """
    model = RandomForestClassifier(n_estimators=20, criterion="entropy")
    model.fit(train_data[0], train_data[1].flatten())
    # Test
    test_features = test_data[0]
    predicted_y = model.predict(test_features)
    predicted_proba = model.predict_proba(test_features)
    # Evaluate the Test and Store Result
    confusion_matrix = get_confusion_matrix(num_classes=num_classes,
                                            label=test_data[1].flatten(), predicted=predicted_y)
    result.add_record(model.get_params(), key=token, confusion_matrix=confusion_matrix)
    # predict_proba() only emits one column per class seen during fit(), in the
    # order of model.classes_. Scatter those columns into a full-width array so
    # that column j always refers to class j.
    # NOTE(review): assumes labels are integer class indices in
    # [0, num_classes) - confirm against the dataset export.
    if predicted_proba.shape[1] != num_classes:
        padded_proba = np.zeros((predicted_proba.shape[0], num_classes), np.float32)
        for column, class_label in enumerate(model.classes_):
            padded_proba[:, class_label] = predicted_proba[:, column]
        predicted_proba = padded_proba
    return predicted_y, predicted_proba


def load_and_test(token, test_data, num_classes, result):
    """Rebuild the classifier recorded under ``token`` and run it on ``test_data``.

    Args:
        token (:obj:`str`): token representing this run
        test_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of testing feature and label
        num_classes (:obj:`int`): Number of classes (currently unused)
        result (:obj:`pyActLearn.performance.record.LearningResult`): LearningResult object to hold learning result

    Returns:
        :obj:`tuple`: ``(predicted_y, predicted_proba)`` as returned by the
        classifier's ``predict`` and ``predict_proba``.
    """
    model = RandomForestClassifier(n_estimators=20, criterion="entropy")
    # set_params() accepts keyword arguments only, so the stored parameter
    # dictionary must be unpacked; passing it positionally raises TypeError.
    # NOTE(review): presumably the 'model' entry holds the get_params()
    # dictionary saved by training_and_test() - confirm against LearningResult.
    model.set_params(**result.get_record_by_key(token)['model'])
    # NOTE(review): only hyper-parameters are restored here; the estimator is
    # never fitted in this function, so predict() is expected to fail unless
    # the fitted trees are persisted and reloaded - confirm and fix upstream.
    # Test
    test_features = test_data[0]
    predicted_y = model.predict(test_features)
    predicted_proba = model.predict_proba(test_features)
    return predicted_y, predicted_proba

if __name__ == '__main__':
    # Command line driver: resolve the output/dataset/HDF5 directories, make
    # sure the HDF5 export exists, train (or reload) a random forest on a fixed
    # set of weeks and write the back-annotated predictions plus the result
    # record into the output directory.
    parser = argparse.ArgumentParser(description='Run Decision Tree on single resident CASAS datasets.')
    parser.add_argument('-d', '--dataset', help='Directory to original datasets')
    parser.add_argument('-o', '--output', help='Output folder')
    parser.add_argument('--h5py', help='HDF5 dataset folder')
    args = parser.parse_args()
    # Default parameters: log file named after this script plus a timestamp
    log_filename = os.path.basename(__file__).split('.')[0] + \
                   '-%s.log' % datetime.now().strftime('%y%m%d_%H:%M:%S')
    # Setup output directory
    output_dir = args.output
    if output_dir is not None:
        output_dir = os.path.abspath(os.path.expanduser(output_dir))
        if os.path.exists(output_dir):
            # Found output_dir, check if it is a directory
            if not os.path.isdir(output_dir):
                exit('Output directory %s is found, but not a directory. Abort.' % output_dir)
        else:
            # Create directory
            os.makedirs(output_dir)
    else:
        output_dir = '.'
    # Original CASAS dataset directory (optional; only needed to build the
    # HDF5 export when it does not exist yet)
    casas_data_dir = args.dataset
    if casas_data_dir is not None:
        casas_data_dir = os.path.abspath(os.path.expanduser(casas_data_dir))
        if not os.path.isdir(casas_data_dir):
            exit('CASAS dataset at %s does not exist. Abort.' % casas_data_dir)
    # Locate the HDF5 dataset directory
    h5py_dir = args.h5py
    if h5py_dir is not None:
        h5py_dir = os.path.abspath(os.path.expanduser(h5py_dir))
    else:
        # Default location
        h5py_dir = os.path.join(output_dir, 'h5py')
    if os.path.exists(h5py_dir) and not os.path.isdir(h5py_dir):
        exit('h5py dataset location %s is not a directory. Abort.' % h5py_dir)
    log_filename = os.path.join(output_dir, log_filename)
    # Setup Logging as early as possible
    logging.basicConfig(level=logging.DEBUG,
                        format='[%(asctime)s] %(name)s:%(levelname)s:%(message)s',
                        handlers=[logging.FileHandler(log_filename),
                                  logging.StreamHandler()])
    if not CASASFuel.files_exist(h5py_dir):
        # HDF5 export is missing - build it from the raw dataset if one was given
        # NOTE(review): when no dataset is given either, CASASFuel below is
        # constructed on a directory without files - confirm how it reacts.
        if casas_data_dir is not None:
            casas_data = CASASData(path=casas_data_dir)
            casas_data.summary()
            # Statistical features, neither normalized nor computed per sensor
            casas_data.populate_feature(method='stat', normalized=False, per_sensor=False)
            casas_data.export_hdf5(h5py_dir)
    casas_fuel = CASASFuel(dir_name=h5py_dir)
    # Prepare learning result
    result_pkl_file = os.path.join(output_dir, 'result.pkl')
    if os.path.isfile(result_pkl_file):
        # NOTE: unpickling executes arbitrary code - only reuse output
        # directories whose result.pkl comes from a trusted run.
        with open(result_pkl_file, 'rb') as f:
            result = pickle.load(f)
        if result.data != h5py_dir:
            logger.error('Result pickle file found for different dataset %s', result.data)
            exit('Cannot save learning result at %s' % result_pkl_file)
    else:
        result = LearningResult(name='DecisionTree', data=h5py_dir, mode='by_week')
    num_classes = casas_fuel.get_output_dims()
    # Open Fuel and get all splits
    split_list = casas_fuel.get_set_list()
    train_names = ('week 24', 'week 23', 'week 22', 'week 21')
    test_names = ('week 25', 'week 26', 'week 27', 'week 28')
    test_name = 'single_test'
    train_set = casas_fuel.get_dataset(train_names, load_in_memory=True)
    train_set_data = train_set.data_sources
    test_set = casas_fuel.get_dataset(test_names, load_in_memory=True)
    test_set_data = test_set.data_sources
    # Run the random forest: train unless a record for this run already exists
    logger.info('Training on %s, Testing on %s', str(train_names), str(test_names))
    if result.get_record_by_key(test_name) is None:
        prediction, prediction_proba = training_and_test(test_name, train_set_data, test_set_data, num_classes, result)
    else:
        prediction, prediction_proba = load_and_test(test_name, test_set_data, num_classes, result)
    # Back annotation: context managers guarantee the files are flushed and closed
    with open(os.path.join(output_dir, 'back_annotated.txt'), 'w') as fp_back_annotated:
        casas_fuel.back_annotate(fp_back_annotated, prediction=prediction, split_name=test_names)
    with open(os.path.join(output_dir, 'back_annotated_proba.txt'), 'w') as fp_back_probability:
        casas_fuel.back_annotate_with_proba(fp_back_probability, prediction_proba=prediction_proba,
                                            split_name=test_names)
    # Persist the learning result for later runs and export a spreadsheet
    with open(result_pkl_file, 'wb') as f:
        pickle.dump(obj=result, file=f, protocol=pickle.HIGHEST_PROTOCOL)
    result.export_to_xlsx(os.path.join(output_dir, 'result.xlsx'))



1		import os
2		import pickle
3		import logging
4		import argparse
5		from sklearn.ensemble import RandomForestClassifier
6		from datetime import datetime
7		from pyActLearn.CASAS.data import CASASData
8		from pyActLearn.CASAS.fuel import CASASFuel
9		from pyActLearn.performance.record import LearningResult
10		from pyActLearn.performance import get_confusion_matrix
11
12		logger = logging.getLogger(__file__)
13
14
15		def training_and_test(token, train_data, test_data, num_classes, result):
16		"""Train and test
17
18		Args:
19		token (:obj:`str`): token representing this run
20		train_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of training feature and label
21		test_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of testing feature and label
22		num_classes (:obj:`int`): Number of classes
23		result (:obj:`pyActLearn.performance.record.LearningResult`): LearningResult object to hold learning result
24		"""
25		model = RandomForestClassifier(n_estimators=20, criterion="entropy")
26		model.fit(train_data[0], train_data[1].flatten())
27		# Test
28		predicted_y = model.predict(test_data[0])
29		predicted_proba = model.predict_proba(test_data[0])
30		# Evaluate the Test and Store Result
31		confusion_matrix = get_confusion_matrix(num_classes=num_classes,
32		label=test_data[1].flatten(), predicted=predicted_y)
33		result.add_record(model.get_params(), key=token, confusion_matrix=confusion_matrix)
34		# In case any label is missing, populate it
35		if predicted_proba.shape[1] != num_classes:
36		temp_array = np.zeros((predicted_proba.shape[0], num_classes), np.float32)
37		for i in range(len(model.classes_)):
38		temp_array[:, model.classes_[i]] = predicted_proba[:, i]
39		predicted_proba = temp_array
40		return predicted_y, predicted_proba
41
42
43		def load_and_test(token, test_data, num_classes, result):
44		"""Load and test
45
46		Args:
47		token (:obj:`str`): token representing this run
48		test_data (:obj:`tuple` of :obj:`numpy.array`): Tuple of testing feature and label
49		num_classes (:obj:`int`): Number of classes
50		result (:obj:`pyActLearn.performance.record.LearningResult`): LearningResult object to hold learning result
51		"""
52		model = RandomForestClassifier(n_estimators=20, criterion="entropy")
53		model.set_params(result.get_record_by_key(token)['model'])
54		# Test
55		predicted_y = model.predict(test_data[0])
56		predicted_proba = model.predict_proba(test_data[0])
57		return predicted_y, predicted_proba
58
59	View Code Duplication	if __name__ == '__main__':
		0 ignored issues – show Duplication introduced 2017-02-08 05:31 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
60		args_ok = False
61		parser = argparse.ArgumentParser(description='Run Decision Tree on single resident CASAS datasets.')
62		parser.add_argument('-d', '--dataset', help='Directory to original datasets')
63		parser.add_argument('-o', '--output', help='Output folder')
64		parser.add_argument('--h5py', help='HDF5 dataset folder')
65		args = parser.parse_args()
66		# Default parameters
67		log_filename = os.path.basename(__file__).split('.')[0] + \
68		'-%s.log' % datetime.now().strftime('%y%m%d_%H:%M:%S')
69		# Setup output directory
70		output_dir = args.output
71		if output_dir is not None:
72		output_dir = os.path.abspath(os.path.expanduser(output_dir))
73		if os.path.exists(output_dir):
74		# Found output_dir, check if it is a directory
75		if not os.path.isdir(output_dir):
76		exit('Output directory %s is found, but not a directory. Abort.' % output_dir)
77		else:
78		# Create directory
79		os.makedirs(output_dir)
80		else:
81		output_dir = '.'
82		# If dataset is specified, update h5py
83		casas_data_dir = args.dataset
84		if casas_data_dir is not None:
85		casas_data_dir = os.path.abspath(os.path.expanduser(casas_data_dir))
86		if not os.path.isdir(casas_data_dir):
87		exit('CASAS dataset at %s does not exist. Abort.' % casas_data_dir)
88		# Find h5py dataset first
89		h5py_dir = args.h5py
90		if h5py_dir is not None:
91		h5py_dir = os.path.abspath(os.path.expanduser(h5py_dir))
92		else:
93		# Default location
94		h5py_dir = os.path.join(output_dir, 'h5py')
95		if os.path.exists(h5py_dir):
96		if not os.path.isdir(h5py_dir):
97		exit('h5py dataset location %s is not a directory. Abort.' % h5py_dir)
98		log_filename = os.path.join(output_dir, log_filename)
99		# Setup Logging as early as possible
100		logging.basicConfig(level=logging.DEBUG,
101		format='[%(asctime)s] %(name)s:%(levelname)s:%(message)s',
102		handlers=[logging.FileHandler(log_filename),
103		logging.StreamHandler()])
104		if not CASASFuel.files_exist(h5py_dir):
105		# Finish check and creating all directory needed - now load datasets
106		if casas_data_dir is not None:
107		casas_data = CASASData(path=casas_data_dir)
108		casas_data.summary()
109		# SVM needs to use statistical feature with per-sensor and normalization
110		casas_data.populate_feature(method='stat', normalized=False, per_sensor=False)
111		casas_data.export_hdf5(h5py_dir)
112		casas_fuel = CASASFuel(dir_name=h5py_dir)
113		# Prepare learning result
114		result_pkl_file = os.path.join(output_dir, 'result.pkl')
115		result = None
116		if os.path.isfile(result_pkl_file):
117		f = open(result_pkl_file, 'rb')
118		result = pickle.load(f)
119		f.close()
120		if result.data != h5py_dir:
121		logger.error('Result pickle file found for different dataset %s' % result.data)
122		exit('Cannot save learning result at %s' % result_pkl_file)
123		else:
124		result = LearningResult(name='DecisionTree', data=h5py_dir, mode='by_week')
125		num_classes = casas_fuel.get_output_dims()
126		# Open Fuel and get all splits
127		split_list = casas_fuel.get_set_list()
128		train_names = ('week 24', 'week 23', 'week 22', 'week 21')
129		test_names = ('week 25', 'week 26', 'week 27', 'week 28')
130		test_name = 'single_test'
131		train_set = casas_fuel.get_dataset(train_names, load_in_memory=True)
132		(train_set_data) = train_set.data_sources
133		test_set = casas_fuel.get_dataset(test_names, load_in_memory=True)
134		(test_set_data) = test_set.data_sources
135		# Prepare Back Annotation
136		fp_back_annotated = open(os.path.join(output_dir, 'back_annotated.txt'), 'w')
137		fp_back_probability = open(os.path.join(output_dir, 'back_annotated_proba.txt'), 'w')
138		# run svm
139		logger.info('Training on %s, Testing on %s' % (str(train_names), str(test_names)))
140		if result.get_record_by_key(test_name) is None:
141		prediction, prediction_proba = training_and_test(test_name, train_set_data, test_set_data, num_classes, result)
142		else:
143		prediction, prediction_proba = load_and_test(test_name, test_set_data, num_classes, result)
144		casas_fuel.back_annotate(fp_back_annotated, prediction=prediction, split_name=test_names)
145		casas_fuel.back_annotate_with_proba(fp_back_probability, prediction_proba=prediction_proba, split_name=test_names)
146		train_name = test_name
147		train_set_data = test_set_data
148		f = open(result_pkl_file, 'wb')
149		pickle.dump(obj=result, file=f, protocol=pickle.HIGHEST_PROTOCOL)
150		f.close()
151		result.export_to_xlsx(os.path.join(output_dir, 'result.xlsx'))
152
153

TinghuiWang / pyActLearn

Push — master ( 2384cb...f17ea4 )

training_and_test() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like